I want to reproduce the demo, but I got an error:
error: nvJitLinkComplete(handle) failed with error 4
error: ERROR 3: ptxjit:ltoPtx
ERROR NVJITLINK_ERROR_PTX_COMPILE: JIT the PTX (ltoPtx)
That error appears to be NVJITLINK_ERROR_PTX_COMPILE.
It may be due to a corrupted environment, or a mismatch between compile options and the target GPU.
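If you suspect an options/GPU mismatch, one quick check (assuming a driver recent enough to support the compute_cap query field) is to confirm the device's compute capability:
# nvidia-smi --query-gpu=name,compute_cap --format=csv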
The demo works correctly for me. Here is a full test case, run on an L4 GPU (cc 8.9) with CUDA 12.2 on Linux:
# cat offline.cu
__device__ float compute(float a, float x, float y) {
  return a * x + y;
}
# cat online.cpp
#include <nvrtc.h>
#include <cuda.h>
#include <nvJitLink.h>
#include <iostream>
#define NUM_THREADS 128
#define NUM_BLOCKS 32
#define NVRTC_SAFE_CALL(x) \
  do { \
    nvrtcResult result = x; \
    if (result != NVRTC_SUCCESS) { \
      std::cerr << "\nerror: " #x " failed with error " \
                << nvrtcGetErrorString(result) << '\n'; \
      exit(1); \
    } \
  } while(0)
#define CUDA_SAFE_CALL(x) \
  do { \
    CUresult result = x; \
    if (result != CUDA_SUCCESS) { \
      const char *msg; \
      cuGetErrorName(result, &msg); \
      std::cerr << "\nerror: " #x " failed with error " \
                << msg << '\n'; \
      exit(1); \
    } \
  } while(0)
#define NVJITLINK_SAFE_CALL(h,x) \
  do { \
    nvJitLinkResult result = x; \
    if (result != NVJITLINK_SUCCESS) { \
      std::cerr << "\nerror: " #x " failed with error " \
                << result << '\n'; \
      size_t lsize; \
      result = nvJitLinkGetErrorLogSize(h, &lsize); \
      if (result == NVJITLINK_SUCCESS && lsize > 0) { \
        char *log = (char*)malloc(lsize); \
        result = nvJitLinkGetErrorLog(h, log); \
        if (result == NVJITLINK_SUCCESS) { \
          std::cerr << "error: " << log << '\n'; \
          free(log); \
        } \
      } \
      exit(1); \
    } \
  } while(0)
const char *lto_saxpy = " \n\
extern __device__ float compute(float a, float x, float y); \n\
 \n\
extern \"C\" __global__ \n\
void saxpy(float a, float *x, float *y, float *out, size_t n) \n\
{ \n\
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n\
  if (tid < n) { \n\
    out[tid] = compute(a, x[tid], y[tid]); \n\
  } \n\
} \n";
int main(int argc, char *argv[])
{
  size_t numBlocks = 32;
  size_t numThreads = 128;
  // Create an instance of nvrtcProgram with the code string.
  nvrtcProgram prog;
  NVRTC_SAFE_CALL(
    nvrtcCreateProgram(&prog,          // prog
                       lto_saxpy,      // buffer
                       "lto_saxpy.cu", // name
                       0,              // numHeaders
                       NULL,           // headers
                       NULL));         // includeNames
  // Specify that LTO IR should be generated for LTO operation.
  const char *opts[] = {"-dlto",
                        "--relocatable-device-code=true"};
  nvrtcResult compileResult = nvrtcCompileProgram(prog,  // prog
                                                  2,     // numOptions
                                                  opts); // options
  // Obtain compilation log from the program.
  size_t logSize;
  NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
  char *log = new char[logSize];
  NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
  std::cout << log << '\n';
  delete[] log;
  if (compileResult != NVRTC_SUCCESS) {
    exit(1);
  }
  // Obtain generated LTO IR from the program.
  size_t LTOIRSize;
  NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, &LTOIRSize));
  char *LTOIR = new char[LTOIRSize];
  NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, LTOIR));
  // Destroy the program.
  NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));

  CUdevice cuDevice;
  CUcontext context;
  CUmodule module;
  CUfunction kernel;
  CUDA_SAFE_CALL(cuInit(0));
  CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
  CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));

  // Load the generated LTO IR and the LTO IR generated offline
  // and link them together.
  nvJitLinkHandle handle;
  // Dynamically determine the arch to link for.
  int major = 0;
  int minor = 0;
  CUDA_SAFE_CALL(cuDeviceGetAttribute(&major,
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
  CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor,
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
  int arch = major*10 + minor;
  char smbuf[16];
  sprintf(smbuf, "-arch=sm_%d", arch);
  const char *lopts[] = {"-lto", smbuf};
  NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts));
  // NOTE: assumes "offline.fatbin" is in the current directory.
  // The fatbinary contains LTO IR generated offline using nvcc.
  NVJITLINK_SAFE_CALL(handle, nvJitLinkAddFile(handle, NVJITLINK_INPUT_FATBIN,
                                               "offline.fatbin"));
  NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
                                               (void *)LTOIR, LTOIRSize,
                                               "lto_online"));
  // The call to nvJitLinkComplete causes the linker to link together the two
  // LTO IR modules (offline and online), do optimization on the linked LTO IR,
  // and generate a cubin from it.
  NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle));
  size_t cubinSize;
  NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize));
  void *cubin = malloc(cubinSize);
  NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin));
  NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle));

  CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin));
  CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy"));

  // Generate input for execution, and create output buffers.
  size_t n = NUM_THREADS * NUM_BLOCKS;
  size_t bufferSize = n * sizeof(float);
  float a = 5.1f;
  float *hX = new float[n], *hY = new float[n], *hOut = new float[n];
  for (size_t i = 0; i < n; ++i) {
    hX[i] = static_cast<float>(i);
    hY[i] = static_cast<float>(i * 2);
  }
  CUdeviceptr dX, dY, dOut;
  CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize));
  CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize));
  CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize));
  CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize));
  CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize));

  // Execute SAXPY.
  void *args[] = { &a, &dX, &dY, &dOut, &n };
  CUDA_SAFE_CALL(
    cuLaunchKernel(kernel,
                   NUM_BLOCKS, 1, 1,  // grid dim
                   NUM_THREADS, 1, 1, // block dim
                   0, NULL,           // shared mem and stream
                   args, 0));         // arguments
  CUDA_SAFE_CALL(cuCtxSynchronize());

  // Retrieve and print output.
  CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize));
  for (size_t i = 0; i < n; ++i) {
    std::cout << a << " * " << hX[i] << " + " << hY[i]
              << " = " << hOut[i] << '\n';
  }

  // Release resources.
  CUDA_SAFE_CALL(cuMemFree(dX));
  CUDA_SAFE_CALL(cuMemFree(dY));
  CUDA_SAFE_CALL(cuMemFree(dOut));
  CUDA_SAFE_CALL(cuModuleUnload(module));
  CUDA_SAFE_CALL(cuCtxDestroy(context));
  free(cubin);
  delete[] hX;
  delete[] hY;
  delete[] hOut;
  delete[] LTOIR;
  return 0;
}
# nvcc -arch lto_89 -rdc=true -fatbin offline.cu
# g++ online.cpp -o online -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lnvrtc -lnvJitLink -lcuda -Wl,-rpath,/usr/local/cuda/lib64
# compute-sanitizer ./online
========= COMPUTE-SANITIZER
5.1 * 0 + 0 = 0
5.1 * 1 + 2 = 7.1
5.1 * 2 + 4 = 14.2
5.1 * 3 + 6 = 21.3
5.1 * 4 + 8 = 28.4
5.1 * 5 + 10 = 35.5
5.1 * 6 + 12 = 42.6
5.1 * 7 + 14 = 49.7
5.1 * 8 + 16 = 56.8
5.1 * 9 + 18 = 63.9
5.1 * 10 + 20 = 71
5.1 * 11 + 22 = 78.1
5.1 * 12 + 24 = 85.2
5.1 * 13 + 26 = 92.3
5.1 * 14 + 28 = 99.4
5.1 * 15 + 30 = 106.5
5.1 * 16 + 32 = 113.6
5.1 * 17 + 34 = 120.7
5.1 * 18 + 36 = 127.8
5.1 * 19 + 38 = 134.9
5.1 * 20 + 40 = 142
5.1 * 21 + 42 = 149.1
5.1 * 22 + 44 = 156.2
5.1 * 23 + 46 = 163.3
5.1 * 24 + 48 = 170.4
5.1 * 25 + 50 = 177.5
5.1 * 26 + 52 = 184.6
5.1 * 27 + 54 = 191.7
5.1 * 28 + 56 = 198.8
5.1 * 29 + 58 = 205.9
5.1 * 30 + 60 = 213
5.1 * 31 + 62 = 220.1
5.1 * 32 + 64 = 227.2
5.1 * 33 + 66 = 234.3
5.1 * 34 + 68 = 241.4
5.1 * 35 + 70 = 248.5
5.1 * 36 + 72 = 255.6
5.1 * 37 + 74 = 262.7
5.1 * 38 + 76 = 269.8
5.1 * 39 + 78 = 276.9
5.1 * 40 + 80 = 284
5.1 * 41 + 82 = 291.1
5.1 * 42 + 84 = 298.2
5.1 * 43 + 86 = 305.3
5.1 * 44 + 88 = 312.4
5.1 * 45 + 90 = 319.5
5.1 * 46 + 92 = 326.6
5.1 * 47 + 94 = 333.7
5.1 * 48 + 96 = 340.8
... (output reduced for posting)
5.1 * 4090 + 8180 = 29039
5.1 * 4091 + 8182 = 29046.1
5.1 * 4092 + 8184 = 29053.2
5.1 * 4093 + 8186 = 29060.3
5.1 * 4094 + 8188 = 29067.4
5.1 * 4095 + 8190 = 29074.5
========= ERROR SUMMARY: 0 errors
#
The only change I made from the documented content was to change the offline.cu arch specification from 52 to 89 to match my GPU.
In order to diagnose what may be happening in your case, we’ll need a complete description:
- Are you making any changes at all to the documented example?
- What CUDA version are you using?
- What operating system are you running on?
- What GPU are you running on?
- Provide a full console example, just as I have done.
Without all 5 of those things, I probably wouldn’t be able to respond further.
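For reference, the CUDA version, OS, and GPU items can be captured with a few standard commands (paths may vary on your system):
# /usr/local/cuda/bin/nvcc --version
# uname -a
# nvidia-smi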
@Robert_Crovella hi, our code is private and I cannot post it, but we have indeed encountered this issue, and it is critical for our product. Can I add you as a TG or DC friend? Could you help me look into this issue online? It has been bothering us for almost two months now. Also, we have a lot of NVIDIA GPUs. Thank you very much
Or a Google meeting?
So I guess you are not using the demo code exactly, or wanting to reproduce the demo exactly, which is what I had understood from this:
As a first step/verification, if it were me working on this, I would want to first verify that I can reproduce (exactly) the demo, just as I have shown, so that I could rule out environmental issues.
After that, if I were working on it, and particularly if I wanted help from others in a public forum, I would work to generate a minimal reproducer. This has a number of positive attributes:
- I may end up with an example that is sufficiently anonymized that I can post it on a public forum without IP concerns
- I may actually discover a more narrow test case which makes diagnosis in a public forum possible
- I may discover the problem myself, effectively not needing help in a public forum any longer
It’s a fairly powerful debugging methodology: produce the simplest possible case that still shows the issue. It’s commonly used and suggested on public help forums.
Beyond that, you may get some clues from the error code itself and the point at which it occurs. For example, try a full offline compilation of your code, perhaps following the LTO path in the nvrtc/nvJitLink example, to make sure there are no compilation or linking errors when compiling offline. Also, when people are having trouble with CUDA tools from NVIDIA, I often suggest retrying on the latest available CUDA version. Bugs are always possible, and bugs get fixed all the time.
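As a rough sketch of what that offline check could look like (assuming sm_89 as above, and a hypothetical main.cu containing the host code plus the saxpy kernel that links against compute() from offline.cu):
# nvcc -arch=sm_89 -dlto -rdc=true -c offline.cu -o offline.o
# nvcc -arch=sm_89 -dlto -rdc=true -c main.cu -o main.o
# nvcc -arch=sm_89 -dlto offline.o main.o -o saxpy_offline
# ./saxpy_offline
If that offline build fails, the problem is likely in the code or the compile options rather than in the JIT environment.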
Anyway, I don’t generally get into private consultation based on my forum posts. If you have NVIDIA contacts (e.g. those that call on your organization or firm), then you may wish to reach out to them; they may be able to facilitate a private consultation.