atomicAdd always cause cuda error illegal address

__global__ void ImgDiff(cv::cuda::PtrStepSz<uchar3> img_a,cv::cuda::PtrStepSz<uchar3> img_b,cv::cuda::PtrStepSz<float> his_ret)
{
    int w = blockIdx.x;
    int h = blockIdx.y;

    int offset = threadIdx.x; 
    int grid_x= gridDim.x; 


    if ((h+offset)<grid_x){
        int ba = img_a(w+offset,h).x;
        int ga = img_a(w+offset,h).y;
        int ra = img_a(w+offset,h).z;

        int bb = img_b(w,h).x;
        int gb = img_b(w,h).y;
        int rb = img_b(w,h).z;

        int color = (abs(ra-rb)+abs(ga-gb)+abs(ba-bb));
        
        if (color != 0)
        {
            atomicAdd(&(his_ret(offset,color)),(float)color);  
        }
                
    }
}

int main(){
    cv::Mat imga = cv::imread("2640393.jpeg");
    cv::cuda::GpuMat gpu_imga(imga);
    
    float* ret_data;
    size_t ret_size = sizeof(float)*256*30;
    cudaMalloc(&ret_data,ret_size);
    cudaMemset((void *)ret_data,0,ret_size);
    cv::cuda::PtrStepSz<float> ret(30,256,ret_data,sizeof(float)*256);

    ImgDiff<<<{2560,1920},30>>>(gpu_imga,gpu_imga,ret);

    float* ret_data_host;
    ret_data_host = (float*)malloc(ret_size);
    cudaError_t ErrCode = cudaDeviceSynchronize();
    cudaMemcpy(ret_data_host,ret_data,ret_size,cudaMemcpyDeviceToHost);

    printf("cuda err code %d",ErrCode);

}

when I stop using atomicAdd,this program is run sucessful,but once I use it,even with the most simple params like (*int,int) that I define and initate in main function, but it still dosn’t work.
this device is Jetson Xavier NX with Jetpacke 4.6,CUDA version is 10.2.3.

Do you have an API link to cv::cuda::PtrStepSz ? What is the return value type of his_ret(offset,color) ?

1 Like

sure, here is the link about cv::cuda::PtrStepSz

It’s seem to be related with the size of block, when I change ImgDiff<<<{2560,1920},30>>>(gpu_imga,gpu_imga,ret); to ImgDiff<<<{480,640},30>>>(gpu_imga,gpu_imga,ret); every things just working fine.

The linked documentation does not specify a function like PtrStepSz::operator()(int x, int y), which you seem to be using here.

If it’s return type is not a reference to an array element, the address argument for atomicAdd will be invalid.

1 Like

Here is the source file of PtrStepSz

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_CORE_CUDA_TYPES_HPP
#define OPENCV_CORE_CUDA_TYPES_HPP

#ifndef __cplusplus
#  error cuda_types.hpp header must be compiled as C++
#endif

#if defined(__OPENCV_BUILD) && defined(__clang__)
#pragma clang diagnostic ignored "-Winconsistent-missing-override"
#endif
#if defined(__OPENCV_BUILD) && defined(__GNUC__) && __GNUC__ >= 5
#pragma GCC diagnostic ignored "-Wsuggest-override"
#endif

/** @file
 * @deprecated Use @ref cudev instead.
 */

//! @cond IGNORED

#ifdef __CUDACC__
    #define __CV_CUDA_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
    #define __CV_CUDA_HOST_DEVICE__
#endif

namespace cv
{
    namespace cuda
    {

        // Simple lightweight structures that encapsulates information about an image on device.
        // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile

        template <typename T> struct DevPtr
        {
            typedef T elem_type;
            typedef int index_type;

            enum { elem_size = sizeof(elem_type) };

            T* data;

            __CV_CUDA_HOST_DEVICE__ DevPtr() : data(0) {}
            __CV_CUDA_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}

            __CV_CUDA_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
            __CV_CUDA_HOST_DEVICE__ operator       T*()       { return data; }
            __CV_CUDA_HOST_DEVICE__ operator const T*() const { return data; }
        };

        template <typename T> struct PtrSz : public DevPtr<T>
        {
            __CV_CUDA_HOST_DEVICE__ PtrSz() : size(0) {}
            __CV_CUDA_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}

            size_t size;
        };

        template <typename T> struct PtrStep : public DevPtr<T>
        {
            __CV_CUDA_HOST_DEVICE__ PtrStep() : step(0) {}
            __CV_CUDA_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}

            size_t step;

            __CV_CUDA_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)( (      char*)(((DevPtr<T>*)this)->data) + y * step); }
            __CV_CUDA_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)(((DevPtr<T>*)this)->data) + y * step); }

            __CV_CUDA_HOST_DEVICE__       T& operator ()(int y, int x)       { return ptr(y)[x]; }
            __CV_CUDA_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
        };

        template <typename T> struct PtrStepSz : public PtrStep<T>
        {
            __CV_CUDA_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
            __CV_CUDA_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}

            template <typename U>
            explicit PtrStepSz(const PtrStepSz<U>& d) : PtrStep<T>((T*)d.data, d.step), cols(d.cols), rows(d.rows){}

            int cols;
            int rows;
        };

        typedef PtrStepSz<unsigned char> PtrStepSzb;
        typedef PtrStepSz<unsigned short> PtrStepSzus;
        typedef PtrStepSz<float> PtrStepSzf;
        typedef PtrStepSz<int> PtrStepSzi;

        typedef PtrStep<unsigned char> PtrStepb;
        typedef PtrStep<unsigned short> PtrStepus;
        typedef PtrStep<float> PtrStepf;
        typedef PtrStep<int> PtrStepi;

    }
}

//! @endcond

#endif /* OPENCV_CORE_CUDA_TYPES_HPP */

I fix this code by changing the size of block,still have no idea how this work,but it’s true that this problem somehow related with number of threads.Thank for your time to helping me out.

Ok. the function call is fine. This is not the problem of your code.

You should verify that your indexing is correct. Maybe work through an example on paper.

For example, you check ((h+offset)<grid_x) , but use w+offset to index the x-dimension of the image.
Then, img_a(w+offset,h) is effectively img_a(blockIdx.x+threadIdx.x, blockIdx.y) . According to your kernel parameters, your input image has at least 2560+30 rows and 1920 columns. Is that correct?