Hi Everybody,
I have an interesting issue with 3D texture interpolation that maybe someone else has seen (or can spot a mistake I’m making).
I am trying to interpolate a 3D volume with textures and without. I have recreated the trilinear interpolation method described in the CUDA C programming guide for the non-texture version so that in theory the results should be the same. Ill post the code here reducing to just the involved sections.
Im testing on both a Tesla c2050(CUDA TK3.2) and a GeForce G 210M (CUDA TK4) (both with identical results)
Im currently copying data, and calling the kernel with a MATLAB mex file but I have this texture issue without involving matlab so Im pretty sure its not involved.
I have a very simple test volume which is a 10x10x10 array of zeros with a cube of ones [4:6,4:6,4:6] in the centre. I copy the volume into a cudaArray texture and also a global memory array and i interpolate one with texture calls and the other with an interpolation function.
If i just sample the data in the texture right in the middle of each voxel (i.e. no interpolation) then I get the correct volume back.
The values that return from the interpolation function are what i would expect. but the values from the textures are oddly different
for example if i interpolate in 0.1 steps between a [0] voxel and a [1] voxel,
the interpolation function would give me [0,0.1 , 0.2 , 0.3 , 0.4 , 0.5, 0.6 , 0.7 , 0.8 , 0.9 , 1]; — correct
the texture interpolation would give me [0,0.1016, 0.1992, 0.3008, 0.3984, 0.5, 0.6016, 0.6992, 0.8008, 0.8984, 1]; — oddly incorrect
If anyone can offer a suggestion of where to look I would be very grateful and would owe them a drink.
heres the code, sorry if its not correct to dump code. (long time lurker, first time poster)
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <cutil.h>
#include “mex.h”
texture<float, 3, cudaReadModeElementType> VolComtex;
//-------------------------------------------------------------------------------------------------------------
// GPU INTERPOLATE MIMIC BEGIN
//interpolation function to mimic texture memory interpolation (only used when textures are not used)
//-------------------------------------------------------------------------------------------------------------
device float GPUInterpolate(const float * Vol,
float lx,
float ly,
float lz,
int sx,
int sy,
int sz)
{
float value = 0;
float alpha = 0;
float beta = 0;
float gamma = 0;
float floorx, floory, floorz;
//precalculate floor values
floorx = floor(lx);
floory = floor(ly);
floorz = floor(lz);
//get weights for each direction based on distance
alpha = lx-floorx;
beta = ly-floory;
gamma = lz-floorz;
//catch centre voxel requests and let value = voxel value and return it (faster)
if (alpha==0 && beta==0 && gamma==0)
{
value = Vol[int(lx+ly*sx+lz*sx*sy)];
return (value);
}
float xyz = 0;
float xxyz = 0;
float xyyz = 0;
float xyzz = 0;
float xxyyz = 0;
float xxyzz = 0;
float xyyzz = 0;
float xxyyzz = 0;
int px = 1;
int py = 1;
int pz = 1;
//catch edge requests and let them equal the edge (as they will have weight 0 anyway)
if (lz==(sz-1)) pz=0;
if (ly==(sy-1)) py=0;
if (lx==(sx-1)) px=0;
//get value of the 8 bounding voxels
xyz = Vol[int((floorx) + (floory*sx) + (floorz*sx*sy))]; //value at corner int voxel(x,y,z)
xxyz = Vol[int((floorx+px) + (floory*sx) + (floorz*sx*sy))]; //value at corner int voxel(x+1,y,z)
xyyz = Vol[int(floorx + ((floory+py)*sx) + (floorz*sx*sy))]; //value at corner int voxel(x,y+1,z)
xyzz = Vol[int(floorx + ((floory)*sx) + ((floorz+pz)*sx*sy))]; //value at corner int voxel(x,y,z+1)
xxyyz = Vol[int((floorx+px) + ((floory+py)*sx) + ((floorz)*sx*sy))]; //value at corner int voxel(x+1,y+1,z)
xxyzz = Vol[int((floorx+px) + ((floory)*sx) + ((floorz+pz)*sx*sy))]; //value at corner int voxel(x+1,y,z+1)
xyyzz = Vol[int(floorx + ((floory+py)*sx) + ((floorz+pz)*sx*sy))]; //value at corner int voxel(x,y+1,z+1)
xxyyzz = Vol[int((floorx+px) + ((floory+py)*sx) + ((floorz+pz)*sx*sy))]; //value at corner int voxel(x+1,y+1,z+1)
//get interpolated value of loaction based on weighted values of surrounding voxels
value = ((1-alpha)*(1-beta)*(1-gamma)*(xyz)) + (alpha*(1-beta)*(1-gamma)*(xxyz))
+ (beta*(1-alpha)*(1-gamma)*(xyyz)) + (alpha*beta*(1-gamma)*(xxyyz))
+ (gamma*(1-alpha)*(1-beta)*(xyzz)) + (alpha*gamma*(1-beta)*(xxyzz))
+ (beta*gamma*(1-alpha)*(xyyzz)) + (alpha*beta*gamma*(xxyyzz));
return (value);
}
//-----END of GPU INTERPOLATE MIMIC ---------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------------------
// GPU TEST KERNEL BEGIN
//-------------------------------------------------------------------------------------------------------------
global void testkernel( float * GammaOUT,
float * DoseDiffOUT,
const float * VolRef,
float szXr,
float szYr,
float szZr,
float szXc,
float szYc,
float szZc)
{
// This is the threadIdx index counter
int idx = (threadIdx.x + (threadIdx.y * (blockDim.x)) + (threadIdx.z * (blockDim.x*blockDim.y))) + ( (blockIdx.x * blockDim.x * blockDim.y * blockDim.z)) + ( (blockIdx.y * gridDim.x * blockDim.x * blockDim.y * blockDim.z));
//initalise GammOUT / DoseDiffOUT and set Distance / Dose to 99999 to avoid resusing values
GammaOUT[idx] = 0;//99999;
DoseDiffOUT[idx] = 0;//99999;
float idxComDose = 0;//99999;
float idxRefDose = 0;//99999;
int irX = 0;
int irY = 0;
int irZ = 0;
int icX = 0;
int icY = 0;
int icZ = 0;
// Determine x,y,z from idx of the Reference vol
irX = floor(fmodf(idx,(szXr)));
irY = floor(fmodf((int)(idx) / (int)(szXr),szYr));
irZ = floor(fmodf((int)(idx) / (int)(szXr*szYr),szZr));
// Determine x,y,z from idx of the Compare vol
icX = fmodf(idx,(szXc));
icY = fmodf((int)(idx) / (int)(szXc),szYc);
icZ = fmodf((int)(idx) / (int)(szXc*szYc),szZc);
float offset= 0.1f;
//interpolate along x axis in steps of 0.1 between a 0 voxel and a 1 voxel
idxComDose = tex3D(VolComtex,2+0.5f+(offset*icX), icY+0.5f, icZ+0.5f);
idxRefDose = GPUInterpolate(VolRef,2+(offset*icX),irY,irZ,szXr,szYr,szZr);
//output the results
GammaOUT[idx] = idxComDose;
DoseDiffOUT[idx] = idxRefDose;
//these are just checks that i know the kernel actually ran
GammaOUT[0]=1;
DoseDiffOUT[1]=2;
}
//KERNEL END
//-------------------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------------------
//
/ MAIN MEX FUNCTION /
//
void mexFunction(int nlhs,mxArray *plhs,int nrhs,const mxArray *prhs)
{
// pointers for the variables
double * GammaOUT; // OUT 0
double * DoseDiffOUT; // OUT 1
double * VolRef; // IN 0
double * VolCom; // IN 1
float szXr; // IN 2
float szYr; // IN 3
float szZr; // IN 4
float szXc; // IN 5
float szYc; // IN 6
float szZc; // IN 7
// Vector data
VolRef = mxGetPr(prhs[0]); //0
VolCom = mxGetPr(prhs[1]); //1
// Value parameters
szXr = *(double*)(mxGetPr(prhs[2])); //2
szYr = *(double*)(mxGetPr(prhs[3])); //3
szZr = *(double*)(mxGetPr(prhs[4])); //4
szXc = *(double*)(mxGetPr(prhs[5])); //5
szYc = *(double*)(mxGetPr(prhs[6])); //6
szZc = *(double*)(mxGetPr(prhs[7])); //7
// allocate memory for output
mwSize dims[] = {szXr*szYr*szZr,1,1};
mwSize ndim = 3;
plhs[0] = mxCreateNumericArray( ndim, dims, mxDOUBLE_CLASS, mxREAL );
plhs[1] = mxCreateNumericArray( ndim, dims, mxDOUBLE_CLASS, mxREAL );
// get output/input pointers
GammaOUT = mxGetPr(plhs[0]);
DoseDiffOUT = mxGetPr(plhs[1]);
float *GOUT = (float *) mxMalloc((szXr*szYr*szZr)*sizeof(float));
float *DOUT = (float *) mxMalloc((szXr*szYr*szZr)*sizeof(float));
float *RefData = (float *) mxMalloc((szXr*szYr*szZr)*sizeof(float));
float *CompData = (float *) mxMalloc((szXc*szYc*szZc)*sizeof(float));
for (int i = 0; i < (int)(szXr*szYr*szZr); i++)
RefData[i] = (float)(VolRef[i]);
for (int i = 0; i < (szXc*szYc*szZc); i++)
CompData[i] = (float)(VolCom[i]);
cudaSetDevice(0);
//create a cudaExtent, describing a Volume for a cudaArray which will be a texture;
cudaExtent ComVolumeSize = make_cudaExtent(szXc, szYc, szZc);
//create a channel descriptor that describes variables pulled from texture mem
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
//allocate mem for the 3D cudaArray
cutilSafeCall(cudaMalloc3DArray(&Vol3D2_cuArray, &channelDesc, ComVolumeSize));
size_t ComSize = ComVolumeSize.width* ComVolumeSize.height *ComVolumeSize.depth;
//set copy parameters and copy Volumes to Array
cudaMemcpy3DParms ComCopyParams = {0};
ComCopyParams.srcPtr = make_cudaPitchedPtr((void*)CompData,ComVolumeSize.width*sizeof(float), ComVolumeSize.width, ComVolumeSize.height);
ComCopyParams.dstArray = Vol3D2_cuArray;
ComCopyParams.extent = ComVolumeSize;
ComCopyParams.kind = cudaMemcpyHostToDevice;
cutilSafeCall( cudaMemcpy3D(&ComCopyParams) );
//specify texture reference parameters
VolComtex.normalized = false;
VolComtex.filterMode = cudaFilterModeLinear;
VolComtex.addressMode[0] = cudaAddressModeClamp;
VolComtex.addressMode[1] = cudaAddressModeClamp;
VolComtex.addressMode[2] = cudaAddressModeClamp;
//Bind the array to the texture references
cutilSafeCall(cudaBindTextureToArray(VolComtex, Vol3D2_cuArray, channelDesc));
//declare variables / pointers for GPU
size_t GSize = (szXr)*(szYr)*(szZr)*sizeof(float);
float *GGVol, *GGDose, *GVolume3D1;
// allocate memory on GPU
cutilSafeCall(cudaMalloc((void**)&GGVol, GSize));
cutilSafeCall(cudaMalloc((void**)&GGDose, GSize));
cutilSafeCall(cudaMalloc((void**)&GVolume3D1, GSize));
//copy variables from host memory to device memory
cutilSafeCall(cudaMemcpy(GGVol, GammaOUT, GSize, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(GGDose, DoseDiffOUT, GSize, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(GVolume3D1, RefData, GSize, cudaMemcpyHostToDevice));
//define grid and block size and then invoke GPU kernel (manually or based on dimensions of volume)
int BlockX = 10;
int BlockY = 10;
int GridX = 10;//128;
int GridY = 1;//(szXc*szYc*szZc)/(BlockX*BlockY*GridX);
dim3 dimBlock(BlockX,BlockY);
dim3 dimGrid(GridX,GridY);
//sync threads
cudaThreadSynchronize();
//call GPU Kernel
testkernel<<<dimGrid,dimBlock>>>(GGVol,GGDose,GVolume3D1,szXr,szYr,szZr,szXc,szYc,szZc);
//sync threads (wait until all kernel threads are complete)
cudaThreadSynchronize();
//allocate memory on host and copy variables from DEVICE to HOST
cudaMemcpy(GOUT, GGVol, GSize, cudaMemcpyDeviceToHost);
cudaMemcpy(DOUT, GGDose, GSize, cudaMemcpyDeviceToHost);
for (int i = 0; i< (szXr*szYr*szZr); i++)
{
GammaOUT[i] = (double)(GOUT[i]);
DoseDiffOUT[i] = (double)(DOUT[i]);
}
//free Device Memory
cutilSafeCall(cudaFree(GGVol));
cutilSafeCall(cudaFree(GGDose));
cutilSafeCall(cudaFree(GVolume3D1));
//sync threads
cudaThreadSynchronize();
//free Host Memory
mxFree(GOUT);
mxFree(DOUT);
mxFree(RefData);
mxFree(CompData);
}