Error: kernel launch from __device__ or __global__ functions requires separate compilation mode

Hi, I’m beginner. I’m learning CUDA C by working with problem in Cousera

Trying to run make clean build, this error appeared:

coder@c6afbe98144b:~/project$ make clean build
rm -f simple.exe
nvcc -I./ -I/usr/local/cuda/include -lcuda --std c++17 simple.cu -o simple.exe
simple.cu(133): error: kernel launch from __device__ or __global__ functions requires separate compilation mode

simple.cu(233): error: a __global__ function call must be configured

2 errors detected in the compilation of "simple.cu".
make: *** [Makefile:8: build] Error 1

I looked forward this error but didn’t find any document for this error.

Below is system file:
simple.cu:

/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 * Vector multiplication: C = A * B.
 *
 * This sample is a very basic sample that implements element by element
 * vector multiplication. It is based on the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include "simple.h"

/*
 * CUDA Kernel Device code
 *
 * Computes the vector product of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void vectorMult(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = deviceMultiply(A[i], B[i]);
    }
}

float deviceMultiply(float a, float b)
{
    return a * b;
}

__host__ std::tuple<float *, float *, float *> allocateHostMemory(int numElements)
{
    size_t size = numElements * sizeof(float);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host input vector B
    float *h_B = (float *)malloc(size);

    // Allocate the host output vector C
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    return {h_A, h_B, h_C};
}

__device__ std::tuple<float *, float *, float *> allocateDeviceMemory(int numElements)
{
    // Allocate the device input vector A
    float *d_A = NULL;
    size_t size = numElements * sizeof(float);
    cudaError_t err = cudaMalloc(&d_A, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device input vector B
    float *d_B = NULL;
    err = cudaMalloc(&d_B, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C
    float *d_C = NULL;
    err = cudaMalloc(&d_C, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return {d_A, d_B, d_C};
}

__host__ void copyFromHostToDevice(float *h_A, float *h_B, float *d_A, float *d_B, int numElements)
{
    size_t size = numElements * sizeof(float);
    // Copy the host input vectors A and B in host memory to the device input vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    cudaError_t err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

__global__ void executeKernel(float *d_A, float *d_B, float *d_C, int numElements)
{
    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    // REPLACE x, y, z with a, b, and c variables for memory on the GPU
    vectorMult<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    cudaError_t err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

__host__ void copyFromDeviceToHost(float *d_C, float *h_C, int numElements)
{
    size_t size = numElements * sizeof(float);
    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    cudaError_t err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Free device global memory
__device__ void deallocateMemory(float *h_A, float *h_B, float *h_C, float *d_A, float *d_B, float *d_C)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaFree(d_A);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
}

// Reset the device and exit
__host__ void cleanUpDevice()
{
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaError_t err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

__device__ void performTest(float *h_A, float *h_B, float *h_C, int numElements)
{
    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs((h_A[i] * h_B[i]) - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");
}

/*
 * Host main routine
 */
int main(void)
{
    int numElements = 50000;
    printf("[Vector multiplication of %d elements]\n", numElements);

    auto[h_A, h_B, h_C] = allocateHostMemory(numElements);
    auto[d_A, d_B, d_C] = allocateDeviceMemory(numElements);
    copyFromHostToDevice(h_A, h_B, d_A, d_B, numElements);

    executeKernel(d_A, d_B, d_C, numElements);

    copyFromDeviceToHost(d_C, h_C, numElements);
    performTest(h_A, h_B, h_C, numElements);
    deallocateMemory(h_A, h_B, h_C, d_A, d_B, d_C);

    cleanUpDevice();
    printf("Done\n");
    return 0;
}

simple.h:

#include <stdio.h>
#include <tuple>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

__global__ void vectorMult(const float *A, const float *B, float *C, int numElements);
float deviceMultiply(float a, float b);
__host__ std::tuple<float *, float *, float *> allocateHostMemory(int numElements);
__device__ std::tuple<float *, float *, float *> allocateDeviceMemory(int numElements);
__host__ void copyFromHostToDevice(float *h_A, float *h_B, float *d_A, float *d_B, int numElements);
__global__ void executeKernel(float *d_A, float *d_B, float *d_C);
__host__ void copyFromDeviceToHost(float *d_C, float *h_C, int numElements);
__device__ void deallocateMemory(float *h_A, float *h_B, float *h_C, float *d_A, float *d_B, float *d_C);
void cleanUpDevice();
__device__ void performTest(float *h_A, float *h_B, float *h_C, int numElements);

Makefile:

IDIR=./
COMPILER=nvcc
COMPILER_FLAGS=-I$(IDIR) -I/usr/local/cuda/include -lcuda --std c++17

.PHONY: clean build run

build: simple.cu simple.h
	$(COMPILER) $(COMPILER_FLAGS) simple.cu -o simple.exe

clean:
	rm -f simple.exe

run:
	./simple.exe

all: clean build run

Let me know if you need more details

You’re evidently confused about the decorators __global__, __device__ and when to use them.

__global__ is used to mark a kernel definition only. It indicates code that will run on the device.

__device__ (by itself, when used to mark code) is used to mark code that will run on the device, but is not a kernel by itself.

vectorMult should be marked with __global__, what you have there is correct.

deviceMultiply is called from device code (from the vectorMult kernel.) it must be marked with __device__.

allocateDeviceMemory is called from host code and is expected to run on the host. It should not be marked with __device__.

executeKernel is host code. It should not be marked with __global__.

deallocateMemory is host code. It should not be marked with __device__.

performTest is host code. It should not be marked with __device__.

This was completely brilliant! Exactly what I needed to get my code to compile and run neatly. Thank you very much.

Thanks @Robert_Crovella for straight forward answer.
AFAIK, it’s simply just look at the code in main() and see which function called from main() should be marked with decorator __host__ whereas the function is called to perform calculation marked with __global__ and its child components is marked with __device__, Right?

That’s right. For the purpose of this discussion, at this level of understanding, the kernel launch syntax:

kernel_name<<<...>>>(...)

is the dividing line. The function defined with the name kernel_name needs to be decorated with the __global__ keyword. Anything called from kernel_name needs to be decorated with the __device__ keyword. Repeat for every kernel launch. Everything else can remain undecorated (which is equivalent to __host__ decoration, when using nvcc). This doesn’t take into account CUDA Dynamic Parallelism, and possibly other advanced topics (CUDA Driver API, other launch methods such as CG, decorating a function with both __host__ and __device__ etc.), but these topics are not relevant to the discussion at this level (beginner level - according to your own statement.)

I’m reading 3 first chapter of Programming Massively Parallel Processors: A Hands-on Approach 4th edition, it’s quite easy to understand for beginner like me. They go deep to distinguish 3 kind of different decorators in CUDA C. Any tips for beginner to read and practice code?

Hi, I also take the course.
To do “make”, i needed to modify main function the simple.cu file;
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
copyFromHostToDevice<<<blocksPerGrid, threadsPerBlock>>>(h_A, h_B, d_A, d_B, numElements);

I think this is ok and seems current standard description.
But ,
cleanUpDevice();
in the main function makes an error;
simple.cu(244): error: a global function call must be configured

Actually, the cleanUpDevice is global function and device must do it, and should not be called on host.
So, I do not know how fix this error.

No, that is incorrect. You probably have a __global__ keyword decorating your cleanUpDevice function. The OP for this thread does not have that. If you have that, its incorrect. You should remove that decorator from cleanUpDevice.

The device must not do that function.

Thank you, i understand about the decoration.
Below decoration makes pass test
global void vectorMult
device float deviceMultiply
host std::tuple<float *, float *, float *> allocateHostMemory
host std::tuple<float *, float *, float *> allocateDeviceMemory
host void copyFromHostToDevice
host void executeKernel
host void copyFromDeviceToHost
host void deallocateMemory
host void cleanUpDevice
host void performTest

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.