Segmentation Fault when compiling cu file on cuda7.0

Hi,

When I try to migrate the CUDA project from CUDA 6.5 to CUDA 7.0, some .cu files fail to compile. The NVCC compiler reports a “Segmentation fault”.

The failed code:

// BUG FIX: the original read "typedef float_t float_t;", which is
// self-referential — it only re-declares <math.h>'s float_t, whose
// underlying type is implementation-defined (float OR double).
// The kernels below clearly intend single precision.
typedef float float_t;

// Forward pass of cross-channel Local Response Normalization (LRN).
// Each thread owns one (n, h, w) column and slides a window of `size`
// channels along the channel axis, keeping a running sum of squares:
//   scale[i] = k + alpha_over_size * sum(in[j]^2 over the window)
//   out[i]   = in[i] * scale[i]^(-beta)
// Launch: 1D grid-stride loop; nthreads = num * height * width.
// All pointers must be device pointers.
__global__ void LRNForwardKernel(int nthreads, const float_t *in,
    int num, int numChannels, int height, int width, 
    int size, float_t k, float_t alpha_over_size, float_t beta,
    float_t *scale, float_t *out) {
	int id = blockDim.x*blockIdx.x + threadIdx.x;

	for(int index = id; index < nthreads; index += blockDim.x*gridDim.x) {
		int w = index % width;
		int h = (index / width) % height;
		int n = index / width / height;
		int offset = (n * numChannels * height + h) * width + w;
		int step = height * width;
		// BUG FIX: the original advanced the parameter pointers
		// ("in += offset;" etc.) inside the grid-stride loop, so the
		// offsets accumulated whenever a thread processed more than one
		// index, and every iteration after the first read/wrote out of
		// bounds. Use per-iteration local pointers instead.
		const float_t *in_off = in + offset;
		float_t *out_off = out + offset;
		float_t *scale_off = scale + offset;
		int head = 0;
		int pre_pad = (size - 1) / 2;
		int post_pad = size - pre_pad - 1;
		float_t accum_scale = 0;

		// Prime the window with the channels ahead of output position 0.
		while (head < post_pad) {
			int idx = head*step;
			accum_scale += in_off[idx] * in_off[idx];
			++head;
		}
		// The first half: window still growing on the left edge.
		while (head < size) {
			int idx = head*step;
			accum_scale += in_off[idx] * in_off[idx];
			float_t s = k + accum_scale * alpha_over_size;

			idx = (head - post_pad) * step;
			scale_off[idx] = s;
			out_off[idx] = in_off[idx]*pow(s, (float_t)-beta);
			++head;
		}
		// Center: full window — add the entering channel, drop the leaving one.
		while (head < numChannels) {
			int idx = head*step;
			accum_scale += in_off[idx] * in_off[idx];
			idx = (head - size) * step;
			accum_scale -= in_off[idx] * in_off[idx];
			float_t s = k + accum_scale * alpha_over_size;

			idx = (head - post_pad) * step;
			scale_off[idx] = s;
			out_off[idx] = in_off[idx]*pow(s, (float_t)-beta);
			++head;
		}
		// The last half: window shrinking past the final channel.
		while (head < numChannels + post_pad) {
			int idx = (head - size) * step;
			accum_scale -= in_off[idx] * in_off[idx];
			float_t s = k + accum_scale * alpha_over_size;

			idx = (head - post_pad) * step;
			scale_off[idx] = s;
			out_off[idx] = in_off[idx]*pow(s, (float_t)-beta);
			++head;
		}
	}
}

// Host-side launcher for LRNForwardKernel.
// gridSize/blockSize select the 1D launch configuration; the kernel runs
// asynchronously on `stream`. All pointer arguments must be device pointers.
// NOTE(review): there is no cudaGetLastError() after the launch, so a bad
// launch configuration is silently dropped — consider checking at the
// call site or adding a CUDA_CHECK here.
void LRNForward(int gridSize, int blockSize, cudaStream_t stream,
    int nthreads, const float_t *in,
    int num, int numChannels, int height, int width, 
    int size, float_t k, float_t alpha_over_size, float_t beta,
    float_t *scale, float_t *out)
{
    LRNForwardKernel<<<gridSize, blockSize, 0, stream>>>(
            nthreads, in, 
            num, numChannels, height, width,
            size, k, alpha_over_size, beta, scale, out);
}

// Backward pass of cross-channel LRN. Each thread owns one (n, h, w)
// column and slides the same `size`-channel window along the channel
// axis, maintaining a running sum of out_diff * out_data / scale:
//   in_diff[i] = out_diff[i] * scale[i]^negative_beta
//              - cache_ratio * in_data[i] * accum_ratio
// Launch: 1D grid-stride loop; nthreads = num * height * width.
// All pointers must be device pointers.
__global__ void LRNBackwardKernel(const int nthreads, 
        const float_t *in_data, const float_t *out_data, 
        const float_t *scale, const float_t *out_diff,
        const int num, const int numChannels, const int height, const int width,
        const int size, const float_t negative_beta, const float_t cache_ratio, 
        float_t *in_diff) {
    int id = blockDim.x*blockIdx.x + threadIdx.x;

    for(int index = id; index < nthreads; index += blockDim.x*gridDim.x) {
        int w = index % width;
        int h = (index / width) % height;
        int n = index / width / height;
        int offset = (n * numChannels * height + h) * width + w;
        int step = height * width;
        // BUG FIX: the original advanced the parameter pointers
        // ("in_data += offset;" etc.) inside the grid-stride loop, so the
        // offsets accumulated across iterations and any thread handling
        // more than one index accessed memory out of bounds. Use
        // per-iteration local pointers instead.
        const float_t *in_data_off = in_data + offset;
        const float_t *out_data_off = out_data + offset;
        const float_t *scale_off = scale + offset;
        const float_t *out_diff_off = out_diff + offset;
        float_t *in_diff_off = in_diff + offset;
        int head = 0;
        int pre_pad = size - (size + 1) / 2;
        int post_pad = size - pre_pad - 1;
        float_t accum_ratio = 0;
        // Prime the window: accumulate the first half.
        while (head < post_pad) {
            int idx = head*step;
            accum_ratio += out_diff_off[idx] * out_data_off[idx]/scale_off[idx];
            ++head;
        }
        // Compute the first half: window still growing on the left edge.
        while (head < size) {
            int idx = head*step;
            accum_ratio += out_diff_off[idx] * out_data_off[idx] / scale_off[idx];

            idx = (head - post_pad) * step;
            in_diff_off[idx] = out_diff_off[idx]*pow(scale_off[idx], negative_beta) - 
                cache_ratio*in_data_off[idx]*accum_ratio;
            ++head;
        }
        // Center: full window — add the entering term, drop the leaving one.
        while (head < numChannels) {
            int idx = head*step;
            accum_ratio += out_diff_off[idx]*out_data_off[idx]/scale_off[idx];

            idx = (head - size) * step;
            accum_ratio -= out_diff_off[idx]*out_data_off[idx]/scale_off[idx];

            idx = (head - post_pad) * step;
            in_diff_off[idx] = out_diff_off[idx]*pow(scale_off[idx], negative_beta) - 
                cache_ratio*in_data_off[idx]*accum_ratio;
            ++head;
        }
        // The last half: window shrinking past the final channel.
        while (head < numChannels + post_pad) {
            int idx = (head - size) * step;
            accum_ratio -= out_diff_off[idx]*out_data_off[idx]/scale_off[idx];

            idx = (head - post_pad) * step;
            in_diff_off[idx] = out_diff_off[idx]*pow(scale_off[idx], negative_beta) - 
                cache_ratio*in_data_off[idx] * accum_ratio;
            ++head;
        }
    }
}

// Host-side launcher for LRNBackwardKernel.
// gridSize/blockSize select the 1D launch configuration; the kernel runs
// asynchronously on `stream`. All pointer arguments must be device pointers.
// NOTE(review): there is no cudaGetLastError() after the launch, so a bad
// launch configuration is silently dropped — consider checking at the
// call site or adding a CUDA_CHECK here.
void LRNBackward(int gridSize, int blockSize, cudaStream_t stream,
        const int nthreads, 
        const float_t *in_data, const float_t *out_data, 
        const float_t *scale, const float_t *out_diff,
        const int num, const int numChannels, const int height, const int width,
        const int size, const float_t negative_beta, const float_t cache_ratio, 
        float_t *in_diff) {
    LRNBackwardKernel<<<gridSize, blockSize, 0, stream>>>(
            nthreads,
            in_data, out_data,
            scale, out_diff,
            num, numChannels, height, width,
            size, negative_beta, cache_ratio, in_diff);
}

It compiles successfully if the “++head” statements on lines 37, 50, and 61 are commented out.

The compiling command:

/home/weiming/.local/cuda-7.0/bin/nvcc -ccbin=g++-4.8 lrn.cu -c

Platform: Debian GNU/Linux 7 (wheezy), CUDA 7.0, Driver version 352.21, Titan Black.

I think this might be a new bug in CUDA 7.0. Could anyone help me with this? Thanks in advance.

I can reproduce the problem on CUDA 7.0 but it appears to be fixed in CUDA 7.5RC. You can either download the release candidate now or wait for the production release in the near future.