In the following kernel, if I change the address-space qualifier of the `ndevice` argument from `__global` to `__constant`, the OpenCL program/kernel will not compile.
[codebox]
// FDK kernel
//
// Each work-item handles the voxel addressed by its three global ids:
//   1. returns early if the ids fall outside ndevice.{x,y,z};
//   2. computes a linear index into dev_vol from the ids and vol_dim, shifted
//      by offset.w, then shifts the ids themselves by offset.{x,y,z};
//   3. maps the ids to a position using vol_offset and vol_pix_spacing;
//   4. multiplies that position by dev_matrix (12 floats, read as 3 rows of 4),
//      divides the first two results by the third and adds ic;
//   5. rounds down to an integer pixel (note the x/y swap) and returns early
//      if the pixel lies outside img_dim;
//   6. reads the pixel from dev_img, weights it by sad^2 / (sad - nrm.pos)^2
//      and by scale, and adds it to the value already stored in dev_vol.
//
// dev_img_sampler is not declared in this block; it must be visible at
// program scope.
//
// NOTE(review): ndevice is deliberately left in __global. Declaring it
// __constant was reported to break the build ("Illegal bank number: 11");
// presumably the ten __constant arguments below already use up the device's
// limit on constant arguments -- confirm with CL_DEVICE_MAX_CONSTANT_ARGS.
__kernel void kernel_fdk(
    __global float *dev_vol,
    __read_only image2d_t dev_img,
    __constant float *dev_matrix,
    __constant float4 *nrm,
    __constant float4 *vol_offset,
    __constant float4 *vol_pix_spacing,
    __constant int4 *vol_dim,
    __constant float2 *ic,
    __constant int2 *img_dim,
    __constant float *sad,
    __constant float *scale,
    __constant int4 *offset,
    __global int4 *ndevice
)
{
    // Work-item coordinates and the extent this launch is allowed to touch.
    const int4 extent = *ndevice;
    uint vx = get_global_id(0);
    uint vy = get_global_id(1);
    uint vz = get_global_id(2);

    if (vx >= extent.x || vy >= extent.y || vz >= extent.z) {
        return;
    }

    // Row-major linear index, computed from the unshifted ids and then
    // moved back by offset.w.
    const int4 dims = *vol_dim;
    const int4 shift = *offset;
    long lin = vx + (vy * dims.x) + (vz * dims.x * dims.y);
    lin -= shift.w;

    // From here on the ids carry the offset.
    vx += shift.x;
    vy += shift.y;
    vz += shift.z;

    // Fetch the current accumulator value before doing the arithmetic.
    const float previous = dev_vol[lin];

    // Position of the voxel: origin + index * spacing, per axis.
    // (Each line is a multiply-add the compiler may fuse.)
    const float4 origin = *vol_offset;
    const float4 spacing = *vol_pix_spacing;
    const float px = origin.x + (vx * spacing.x);
    const float py = origin.y + (vy * spacing.y);
    const float pz = origin.z + (vz * spacing.z);

    // 3x4 matrix times (px, py, pz, 1).
    const float hx = (dev_matrix[0] * px) + (dev_matrix[1] * py) + (dev_matrix[2] * pz) + dev_matrix[3];
    const float hy = (dev_matrix[4] * px) + (dev_matrix[5] * py) + (dev_matrix[6] * pz) + dev_matrix[7];
    const float hz = (dev_matrix[8] * px) + (dev_matrix[9] * py) + (dev_matrix[10] * pz) + dev_matrix[11];

    // Divide by the third component and shift by ic.
    const float2 center = *ic;
    const float u = center.x + hx / hz;
    const float v = center.y + hy / hz;

    // Integer pixel, rounded toward negative infinity.
    // The image is addressed with the two axes swapped: pixel.x comes from v.
    const int2 pixel = (int2)(convert_int_rtn(v), convert_int_rtn(u));

    // Skip voxels that land outside the image.
    const int2 size = *img_dim;
    const bool inside = pixel.x >= 0 && pixel.x < size.x
                     && pixel.y >= 0 && pixel.y < size.y;
    if (!inside) {
        return;
    }

    // Only the first channel of the fetched pixel is used.
    const float4 sample = read_imagef(dev_img, dev_img_sampler, pixel);

    // Weight: sad^2 / (sad - nrm . position)^2.
    const float4 normal = *nrm;
    const float src = *sad;
    const float along = (normal.x * px) + (normal.y * py) + (normal.z * pz);
    const float gap = src - along;
    const float weight = (src * src) / (gap * gap);

    // Accumulate the weighted, scaled sample into the volume.
    dev_vol[lin] = previous + ((*scale) * weight * sample.x);
}
[/codebox]
The errors are as follows:
[codebox]
Build Log:
ptxas application ptx input, line 77; error : Illegal bank number: 11
ptxas application ptx input, line 90; error : Illegal bank number: 11
ptxas fatal : Ptx assembly aborted due to errors
error : Ptx compilation failed: gpu=‘sm_11’, device code=‘anonymous_jit_identity’
: Retrieving binary for ‘anonymous_jit_identity’, for gpu=‘sm_11’, usage mode=‘’
: Considering profile ‘compute_10’ for gpu=‘sm_11’ in ‘anonymous_jit_identity’
: Control flags for ‘anonymous_jit_identity’ disable search path
: Ptx binary found for ‘anonymous_jit_identity’, architecture=‘compute_10’
: Ptx compilation for ‘anonymous_jit_identity’, for gpu=‘sm_11’, ocg options=‘’
[/codebox]
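Clearly this is a constant-memory limit issue (the kernel already takes ten `__constant` arguments, so `ndevice` would be the eleventh, and the error complains about bank number 11), but how can it be solved? By using less constant memory?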