Invalid argument when creating 3D texture object in Driver API

I’m trying to write a program that will linearly interpolate into a 3D block of 32-bit floats. It seems like texture objects should be the easiest and fastest way to go about this. However, when I try to run some basic code to do this, it fails at cuTexObjectCreate() with an invalid argument.

Some extra context: the end program is in Java, and I’m interfacing with CUDA through the Jcuda bindings. In that case, it fails at the same step with an access violation from within nvcuda.dll. To figure out whether the issue was in CUDA or Jcuda, I duplicated the program in C. Below is the C source:

#include "cuda.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)

// Device handle and context shared between main() and initCUDA().
CUdevice cuDevice;
CUcontext cuContext;

// Declared static to match the definition at the bottom of the file:
// declaring a function without `static` and then defining it `static`
// is a linkage conflict (invalid C, C11 6.2.2p7).
static CUresult initCUDA(void);

// These are the inline versions for all of the SDK helper functions
// Reports a Driver API failure (numeric code, human-readable string, and
// source location) on stderr and terminates the process. Success is a
// no-op. Invoked through the checkCudaErrors() macro, which supplies
// __FILE__/__LINE__.
inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
	if (err == CUDA_SUCCESS) {
		return;
	}
	const char *errorString;
	cuGetErrorString(err, &errorString);
	fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n",
			err, errorString, file, line);
	exit(EXIT_FAILURE);
}

int main()
{
	checkCudaErrors(initCUDA());

	const size_t dim = 2;
	float hostArray[dim * dim * dim] = { 0,1,2,3,4,5,6,7 };

	CUdeviceptr deviceArray = (CUdeviceptr)NULL;
	checkCudaErrors(cuMemAlloc(&deviceArray, dim * dim * dim * sizeof(float)));
	checkCudaErrors(cuMemcpyHtoD(deviceArray, hostArray, dim * dim * dim * sizeof(float)));

	CUDA_ARRAY3D_DESCRIPTOR arrDesc;
	arrDesc.Format = CU_AD_FORMAT_FLOAT;
	arrDesc.Width = dim;
	arrDesc.Height = dim;
	arrDesc.Depth = dim;
	arrDesc.NumChannels = 1;
	arrDesc.Flags = 0;

	CUarray cuArray;
	checkCudaErrors(cuArray3DCreate(&cuArray, &arrDesc));

	CUDA_MEMCPY3D copyParams;
	copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
	copyParams.srcDevice = deviceArray;
	copyParams.srcXInBytes = 0;
	copyParams.srcY = 0;
	copyParams.srcZ = 0;
	copyParams.srcPitch = dim * sizeof(float);
	copyParams.srcHeight = dim;
	copyParams.srcLOD = 0;

	copyParams.dstMemoryType = CU_MEMORYTYPE_ARRAY;
	copyParams.dstArray = cuArray;
	copyParams.dstXInBytes = 0;
	copyParams.dstY = 0;
	copyParams.dstZ = 0;
	copyParams.dstLOD = 0;

	copyParams.WidthInBytes = dim * sizeof(float);
	copyParams.Height = dim;
	copyParams.Depth = dim;
	
	checkCudaErrors(cuMemcpy3D(&copyParams));

	CUDA_RESOURCE_DESC resDesc;
	resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
	resDesc.res.array.hArray = cuArray;
	resDesc.flags = 0;

	CUDA_TEXTURE_DESC texDesc;
	texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP;
	texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_CLAMP;
	texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_CLAMP;
	texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
	texDesc.flags = 0;
	texDesc.maxAnisotropy = 1;
	texDesc.mipmapFilterMode = CU_TR_FILTER_MODE_POINT;
	texDesc.mipmapLevelBias = 0;
	texDesc.minMipmapLevelClamp = 0;
	texDesc.maxMipmapLevelClamp = 0;

	// commenting this out because it seems to be kosher to set it to null
	/*CUDA_RESOURCE_VIEW_DESC viewDesc;
	viewDesc.format = CU_RES_VIEW_FORMAT_FLOAT_1X32;
	viewDesc.width = dim;
	viewDesc.height = dim;
	viewDesc.depth = dim;
	viewDesc.firstMipmapLevel = 0;
	viewDesc.lastMipmapLevel = 0;
	viewDesc.firstLayer = 0;
	viewDesc.lastLayer = 0;*/

	CUtexObject texture;
	printf("This is where the Java version hits an access violation:\n");
	checkCudaErrors(cuTexObjectCreate(&texture, &resDesc, &texDesc, NULL));
	printf("Seems the C version works?");

	checkCudaErrors(cuTexObjectDestroy(texture));
	checkCudaErrors(cuArrayDestroy(cuArray));

	cuCtxDestroy(cuContext);

    return 0;
}

// Initializes the Driver API: cuInit, selects device 0, prints its compute
// capability, and creates the global cuContext. Returns CUDA_SUCCESS, or
// the cuCtxCreate status on failure (other failures abort via
// checkCudaErrors).
static CUresult initCUDA() {
	checkCudaErrors(cuInit(0));

	CUresult status;
	int major = 0, minor = 0, devID = 0;
	char deviceName[100];

	// The original ignored this return value; an invalid device would
	// then poison every later call.
	checkCudaErrors(cuDeviceGet(&cuDevice, devID));

	// get compute capabilities and the devicename
	checkCudaErrors(cuDeviceComputeCapability(&major, &minor, cuDevice));
	// Pass the real buffer size: the original passed 256 for a 100-byte
	// buffer, letting the driver overrun deviceName on long device names.
	checkCudaErrors(cuDeviceGetName(deviceName, (int)sizeof(deviceName), cuDevice));
	printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

	status = cuCtxCreate(&cuContext, 0, cuDevice);

	if (CUDA_SUCCESS != status) {
		printf("cuCtxCreate(0) returned %d\n\n", status);
		// No cuCtxDestroy here: the context was never created, so the
		// handle is indeterminate and must not be destroyed.
		return status;
	}

	return CUDA_SUCCESS;
}

This is running on a GeForce GTX 645, CC 3.0.