How to use kernel function correctly?

I try to use my kernel function, but an error occured.

Error code: 700
Error message: an illegal memory access was encountered

I try to print ptr in kernel function and Host.
In Host , it is not null
but in device it is null.

d_ppLensMeanDisp is not NULL

d_ppLensMeanDisp is NULL

__constant__ RawImageParameter d_rawImageParameter;
__constant__ DisparityParameter d_disparityParameter;
__constant__ FilterParameterDevice d_filterPatameterDevice; 
__device__ MicroImageParameterDevice d_microImageParameter; 
__device__ float* d_costVol;
__device__ float* d_rawDisp;
__device__ float* d_ppLensMeanDisp;
__device__ float* d_renderCache;
__device__ float* d_inputImg;
__device__ float* d_inputImgRec;
__device__ RanderMapPatch* d_ppRanderMapPatch;
__device__ float* d_tmp;
__device__ float* d_simg;
__device__ int* sx_begin, *sy_begin, *sx_end, *sy_end;
__device__ int* d_randerMapWidth, *d_randerMapHeight;

__constant__ float *d_fltMax;
__constant__ int d_meanDispLenRadius;
__constant__ int d_patchScale9;
__constant__ float d_randerScale;
__constant__ int d_destWidth;
__constant__ int d_destHeight;

__global__ void testKernel() {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // 在一个特定的索引位置修改 d_ppLensMeanDisp
   // printf("m_xLensNum = %d, m_yLensNum = %d\n", d_rawImageParameter.m_xLensNum, d_rawImageParameter.m_yLensNum);
    
    
    if (x < d_rawImageParameter.m_xLensNum && y < d_rawImageParameter.m_yLensNum) {
        int index = y * d_rawImageParameter.m_xLensNum + x;
        if (index == 0) {  
			if (d_ppLensMeanDisp == nullptr) {
				printf("d_ppLensMeanDisp is NULL\n");
			}
			else{
				printf("d_ppLensMeanDisp is not NULL\n");
			}
            d_ppLensMeanDisp[index] = 42.0f;  
        }
    }
}

void DataParameter::mapToGPU()
{
    CUDA_CHECK(cudaMemcpyToSymbol(d_rawImageParameter, &m_rawImageParameter, sizeof(RawImageParameter)));
    CUDA_CHECK(cudaMemcpyToSymbol(d_disparityParameter, &m_disparityParameter, sizeof(DisparityParameter)));

    float fltMax = FLT_MAX;
    CUDA_CHECK(cudaMemcpyToSymbol(d_fltMax, &fltMax, sizeof(float)));

    int* d_validNeighborPixelsNum;
    int* d_validPixelsMask;
    float* d_filterKernel;

    CUDA_CHECK(cudaMalloc((void**)&d_validNeighborPixelsNum, m_filterPatameter.m_pValidNeighborPixelsNum->total() * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_validNeighborPixelsNum, m_filterPatameter.m_pValidNeighborPixelsNum->data, 
                          m_filterPatameter.m_pValidNeighborPixelsNum->total() * sizeof(int), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc((void**)&d_validPixelsMask, m_filterPatameter.m_pValidPixelsMask->total() * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_validPixelsMask, m_filterPatameter.m_pValidPixelsMask->data, 
                          m_filterPatameter.m_pValidPixelsMask->total() * sizeof(int), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc((void**)&d_filterKernel, m_filterPatameter.m_filterKnernel.total() * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_filterKernel, m_filterPatameter.m_filterKnernel.data, 
                          m_filterPatameter.m_filterKnernel.total() * sizeof(float), cudaMemcpyHostToDevice));

    FilterParameterDevice filterParamDevice = { d_validNeighborPixelsNum, d_validPixelsMask, d_filterKernel };
    CUDA_CHECK(cudaMemcpyToSymbol(d_filterPatameterDevice, &filterParamDevice, sizeof(FilterParameterDevice)));

    int rows = m_rawImageParameter.m_yLensNum;
    int cols = m_rawImageParameter.m_xLensNum;
    int srcImgHeight = m_rawImageParameter.m_srcImgHeight;
    int srcImgWidth = m_rawImageParameter.m_srcImgWidth;
    int neighborNum = NEIGHBOR_MATCH_LENS_NUM;


	CUDA_CHECK(cudaMalloc((void**)&d_microImageParameter, sizeof(MicroImageParameterDevice)));


    MicroImageParameterDevice h_microImageParameterDevice;

  
    h_microImageParameterDevice.m_circleDiameter = m_microImageParameter.m_circleDiameter;
    h_microImageParameterDevice.m_circleNarrow = m_microImageParameter.m_circleNarrow;
    h_microImageParameterDevice.m_radiusDisEqu = m_microImageParameter.m_radiusDisEqu;


    int lensCenterPointsSize = m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * sizeof(cv::Point2d);
    CUDA_CHECK(cudaMalloc((void**)&h_microImageParameterDevice.m_ppLensCenterPoints, lensCenterPointsSize));
    CUDA_CHECK(cudaMemcpy(h_microImageParameterDevice.m_ppLensCenterPoints, m_microImageParameter.m_ppLensCenterPoints[0], lensCenterPointsSize, cudaMemcpyHostToDevice));

  
    int pixelsMappingSetSize = m_rawImageParameter.m_srcImgHeight * m_rawImageParameter.m_srcImgWidth * sizeof(int);
    CUDA_CHECK(cudaMalloc((void**)&h_microImageParameterDevice.m_ppPixelsMappingSet, pixelsMappingSetSize));
    CUDA_CHECK(cudaMemcpy(h_microImageParameterDevice.m_ppPixelsMappingSet, m_microImageParameter.m_ppPixelsMappingSet[0], pixelsMappingSetSize, cudaMemcpyHostToDevice));

  
    int matchNeighborLensSize = m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * NEIGHBOR_MATCH_LENS_NUM * sizeof(MatchNeighborLens);
    CUDA_CHECK(cudaMalloc((void**)&h_microImageParameterDevice.m_ppMatchNeighborLens, matchNeighborLensSize));
    CUDA_CHECK(cudaMemcpy(h_microImageParameterDevice.m_ppMatchNeighborLens, m_microImageParameter.m_ppMatchNeighborLens[0][0], matchNeighborLensSize, cudaMemcpyHostToDevice));


	CUDA_CHECK(cudaMemcpyToSymbol(d_microImageParameter, &h_microImageParameterDevice, sizeof(MicroImageParameterDevice)));

    CUDA_CHECK(cudaMalloc((void**)&d_costVol, m_disparityParameter.m_disNum * m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_costVol, 0, m_disparityParameter.m_disNum * m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_rawDisp, m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_rawDisp, 15, m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_ppLensMeanDisp, m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_ppRanderMapPatch, m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * sizeof(RanderMapPatch)));
    CUDA_CHECK(cudaMalloc(&d_tmp, DEST_WIDTH * DEST_HEIGHT * 3 * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_simg, DEST_WIDTH * DEST_HEIGHT * 3 * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&sx_begin, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&sy_begin, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&sx_end, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&sy_end, sizeof(int)));
    CUDA_CHECK(cudaMemset(sx_begin, INT_MAX, sizeof(int)));
    CUDA_CHECK(cudaMemset(sy_begin, INT_MAX, sizeof(int)));
    CUDA_CHECK(cudaMemset(sx_end, INT_MIN, sizeof(int)));
    CUDA_CHECK(cudaMemset(sy_end, INT_MIN, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_randerMapWidth, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_randerMapHeight, sizeof(int)));


	int meanDispLenRadius = MEAN_DISP_LEN_RADIUS;
    int patchScale9 = PATCH_SCALE9;
    float randerScale = RANDER_SCALE;
    int destWidth = DEST_WIDTH;
    int destHeight = DEST_HEIGHT;

    cudaMemcpyToSymbol(d_meanDispLenRadius, &meanDispLenRadius, sizeof(int));
    cudaMemcpyToSymbol(d_patchScale9, &patchScale9, sizeof(int));
    cudaMemcpyToSymbol(d_randerScale, &randerScale, sizeof(float));
    cudaMemcpyToSymbol(d_destWidth, &destWidth, sizeof(int));
    cudaMemcpyToSymbol(d_destHeight, &destHeight, sizeof(int));

	if (d_ppLensMeanDisp == nullptr) {
		printf("d_ppLensMeanDisp is NULL\n");
	}
	else{
		printf("d_ppLensMeanDisp is not NULL\n");
	}
	dim3 blockDim(32, 32);  
    dim3 gridDim((m_rawImageParameter.m_xLensNum + blockDim.x - 1) / blockDim.x, 
                 (m_rawImageParameter.m_yLensNum + blockDim.y - 1) / blockDim.y);
	testKernel<<<gridDim, blockDim>>>();
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
}

As mentioned in your other thread, it is not allowed to directly access __device__ variables from the host.
You cannot use a __device__ variable in cudaMalloc on the host. The programming guide shows how to do it correctly. 1. Introduction — CUDA C++ Programming Guide

__device__ float* devPointer;
float* ptr;
cudaMalloc(&ptr, 256 * sizeof(float));
cudaMemcpyToSymbol(devPointer, &ptr, sizeof(ptr));

Thanks !

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.