openGL interop random segfault (bug?)

I’m on Ubuntu 14.04 with cuda 6.5, driver version 340.29. My application registers a pixel buffer from openGL and writes an image to the buffer every loop, copies the PBO using glTexSubImage2D, and draws the texture. This all works properly until I change my image-generation kernel, then I gdb reports a segmentation fault in cudaGraphicsGLRegisterBuffer. My guess is this is a bug, because the cuda kernel is completely unrelated to cudaGraphicsGLRegisterBuffer, which is called before any processing.

Makefile

CUDA=nvcc
CPP=g++

OUT=out

INC=-I/usr/local/cuda-6.5/include

LINK=-lcudart -lglfw -lGLEW -lGL

FLAGS=-std=gnu++11
CUFLAGS=-std=c++11

all: main.cu GLdisplay.cu
	$(CUDA) main.cu GLdisplay.cu -o $(OUT) $(CUFLAGS) $(INC) $(LINK)

clean:
	rm ./$(OUT)

add:
	git add -A
	git status

main.cu

#define  GLEW_STATIC

// C++ headers
#include <iostream>
#include <fstream>
#include <cstring>

// openGL headers
#include <GL/glew.h>
#include <GLFW/glfw3.h>

// CUDA headers
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>

#include "GLdisplay.h"

#define WINDOW_WIDTH  640
#define WINDOW_HEIGHT 480

#define TEX_WIDTH     1920
#define TEX_HEIGHT    1080

using std::cout;
using std::cerr;
using std::endl;
using std::string;
using std::ifstream;

GLFWwindow* window;
GLuint vao, vbo, pbo;
GLuint vtx, frg, shaders;
GLuint tex;

uint8_t* cudaPBOptr;
size_t cudaPBOsize;

cudaGraphicsResource_t cuGfxPBO;

string loadTxtFileAsString( string filename )
{
	string source;
	string buf = "";
	ifstream file( filename, std::ios::in );

	while( file.good( ) )
	{
		std::getline( file, buf );
		source.append( buf + "\n" );
	}

	file.close( );

	return source;
}

void shaderCompileCheck( void )
{
	GLint status;

	// vertex
	glGetShaderiv( vtx, GL_COMPILE_STATUS, &status );

	if( GL_TRUE != status )
	{
		char buffer[ 512 ];
		glGetShaderInfoLog( vtx, 512, NULL, buffer );
		cerr << "vtx err | " << buffer << endl;
	}

	// fragment
	glGetShaderiv( frg, GL_COMPILE_STATUS, &status );

	if( GL_TRUE != status )
	{
		char buffer[ 512 ];
		glGetShaderInfoLog( frg, 512, NULL, buffer );
		cerr << "frg err | " << buffer << endl;
	}
}

#define cudaErr(err) cudaError( err, __FILE__, __LINE__ )
inline void cudaError( cudaError_t err, const char* file, uint32_t line )
{
	if( cudaSuccess != err )
	{
		cerr << "[" << file << ":" << line << "] ";
		cerr << cudaGetErrorName( err ) << endl;
	}
}

// main function
int main( int argc, char* argv[] )
{
	string vString = loadTxtFileAsString( "vert.glsl" );
	string fString = loadTxtFileAsString( "frag.glsl" );

	const GLchar* vtxSource = vString.c_str( );
	const GLchar* frgSource = fString.c_str( );

/////////////////////////////////////////////////

	glfwInit( );

	glfwWindowHint( GLFW_CONTEXT_VERSION_MAJOR, 3 );
	glfwWindowHint( GLFW_CONTEXT_VERSION_MINOR, 2 );
	glfwWindowHint( GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE );
	glfwWindowHint( GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE );

	glfwWindowHint( GLFW_RESIZABLE, GL_FALSE );

	window = glfwCreateWindow( WINDOW_WIDTH, WINDOW_HEIGHT, "CUDA-GL", NULL, NULL );

	glfwMakeContextCurrent( window );

	glewExperimental = GL_TRUE;
	glewInit( );

	cudaErr( cudaSetDevice( 0 ) );

/////////////////////////////////////////////////

	GLfloat vertices[] = {
	//   X      Y     U     V
		-1.0f,  1.0f, 0.0f, 1.0f, // t l
		 1.0f,  1.0f, 1.0f, 1.0f, // t r
		-1.0f, -1.0f, 0.0f, 0.0f, // b l

		-1.0f, -1.0f, 0.0f, 0.0f, // b l
		 1.0f, -1.0f, 1.0f, 0.0f, // b r
		 1.0f,  1.0f, 1.0f, 1.0f  // t r
	};

	GLbyte pboInit[ TEX_WIDTH * TEX_HEIGHT * 4 ];
	memset( pboInit, 127, sizeof( pboInit ) );

/////////////////////////////////////////////////

	glGenVertexArrays( 1, &vao );
	glBindVertexArray( vao );

	glGenBuffers( 1, &pbo );
	glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo );
	glBufferData( GL_PIXEL_UNPACK_BUFFER, TEX_WIDTH * TEX_HEIGHT * 4, pboInit, GL_STREAM_DRAW );

	glActiveTexture( GL_TEXTURE0 );
	glGenTextures( 1, &tex );
	glBindTexture( GL_TEXTURE_2D, tex );
	glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA, TEX_WIDTH, TEX_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL );
	glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR );
	glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR );

	glGenBuffers( 1, &vbo );
	glBindBuffer( GL_ARRAY_BUFFER, vbo );
	glBufferData( GL_ARRAY_BUFFER, sizeof( vertices ), vertices, GL_STATIC_DRAW );

/////////////////////////////////////////////////

	cudaErr( cudaGraphicsGLRegisterBuffer( &cuGfxPBO, pbo, cudaGraphicsMapFlagsNone ) );

/////////////////////////////////////////////////

	vtx = glCreateShader( GL_VERTEX_SHADER );
	glShaderSource( vtx, 1, &vtxSource, NULL );
	glCompileShader( vtx );

	frg = glCreateShader( GL_FRAGMENT_SHADER );
	glShaderSource( frg, 1, &frgSource, NULL );
	glCompileShader( frg );

	shaderCompileCheck( );

	shaders = glCreateProgram( );
	glAttachShader( shaders, vtx );
	glAttachShader( shaders, frg );

	glBindFragDataLocation( shaders, 0, "outColor" );

	glLinkProgram( shaders );
	glUseProgram( shaders );

/////////////////////////////////////////////////

	GLuint posAtt = glGetAttribLocation( shaders, "position" );
	glEnableVertexAttribArray( posAtt );
	glVertexAttribPointer( posAtt, 2, GL_FLOAT, GL_FALSE, 4 * sizeof( GLfloat ), NULL );

	GLuint texAtt = glGetAttribLocation( shaders, "texcoord" );
	glEnableVertexAttribArray( texAtt );
	glVertexAttribPointer( texAtt, 2, GL_FLOAT, GL_FALSE, 4 * sizeof( GLfloat ), ( void* )( 2 * sizeof( GLfloat ) ) );

/////////////////////////////////////////////////

	glClearColor( 0.0f, 0.0f, 0.0f, 1.0f );

	while( !glfwWindowShouldClose( window ) )
	{
		cudaErr( cudaGraphicsMapResources( 1, &cuGfxPBO, 0 ) );
		cudaErr( cudaGraphicsResourceGetMappedPointer( ( void** )&cudaPBOptr, &cudaPBOsize, cuGfxPBO ) );

		dim3 gridDim( ( int32_t )ceil( ( float )TEX_WIDTH / 16 ), ( int32_t )ceil( ( float )TEX_HEIGHT / 16 ) );
		dim3 blockDim( 16, 16 );

		makeImage<<< gridDim, blockDim >>>( cudaPBOptr, TEX_WIDTH, TEX_HEIGHT );
		cudaDeviceSynchronize( ); cudaErr( cudaGetLastError( ) );

		cudaErr( cudaGraphicsUnmapResources( 1, &cuGfxPBO, 0 ) );

/////////////////////////////////////////////////

		glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo );
		glBindTexture( GL_TEXTURE_2D, tex );

		glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, NULL );

/////////////////////////////////////////////////

		glClear( GL_COLOR_BUFFER_BIT );

		glDrawArrays( GL_TRIANGLES, 0, 6 );

	    glfwSwapBuffers( window );

	    glfwPollEvents( );
	}

/////////////////////////////////////////////////

	glDeleteTextures( 1, &tex );

	glDeleteProgram( shaders );
	glDeleteShader( frg );
	glDeleteShader( vtx );

	glDeleteBuffers( 1, &pbo );
	glDeleteBuffers( 1, &vbo );

	glDeleteVertexArrays( 1, &vao );

	return 0;
}

GLdisplay.cu

#include <cuda_runtime.h>
#include <stdint.h>

#include "GLdisplay.h"

__global__ void makeImage( uint8_t* output, uint32_t width, uint32_t height )
{
	uint32_t xIdx = threadIdx.x + blockIdx.x * blockDim.x;
	uint32_t yIdx = threadIdx.y + blockIdx.y * blockDim.y;
	uint32_t oIdx = ( xIdx + yIdx * width ) * 4;

	if( xIdx >= width || yIdx >= height ) return;

// segfault occurs if any of the lines below are commented or set to write 0
	uint8_t red = ( uint8_t )( 255.0f * ( float )( xIdx + yIdx * width ) / ( width * height ) );
	uint8_t gre = ( uint8_t )( 255.0f * ( 1.0f - ( float )( xIdx + yIdx * width ) / ( width * height ) ) );
	uint8_t blu = ( uint8_t )( 510.0f * abs( ( float )( xIdx + yIdx * width ) / ( width * height ) ) - 0.5f );

	output[ oIdx + 0 ] = red;
	output[ oIdx + 1 ] = gre;
	output[ oIdx + 2 ] = blu;
	output[ oIdx + 3 ] = 255;

 }

GLdisplay.h

#ifndef GL_DISPLAY_H
#define GL_DISPLAY_H

__global__ void makeImage( uint8_t*, uint32_t, uint32_t );

#endif

vert.glsl

#version 150

in vec2 position;
in vec2 texcoord;

out vec2 Texcoord;

void main( )
{
	Texcoord = texcoord;
	gl_Position = vec4( position, 0.0, 1.0 );
}

frag.glsl

#version 150

in vec2 Texcoord;

out vec4 outColor;

uniform sampler2D tex;

void main( )
{
	outColor = texture( tex, Texcoord );
}