# several performance problems

Hello everyone,

I’m trying to implement some image transformations in CUDA: the Radon transform and backprojection. I already have a first program, but it’s still too slow…
I would like to know how to improve the runtime… I’ll focus on the Radon transform:

The critical point is that every pixel of the output image is a line integral over the input image, and no two line integrals are alike. Each line integral performs a linear interpolation and a “sum += …”.
The kernel handles two cases: in one, the linear interpolation is in X; in the other, in Y.
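(For reference, a standard parametrization of what each output sample computes; this is my notation, not from the code:)

```latex
(Rf)(t,\theta) = \int_{\mathbb{R}} f\!\left(t\cos\theta - s\sin\theta,\; t\sin\theta + s\cos\theta\right)\,ds
```

The kernel integrates over y (or x) instead of the arc length s, which is where the trailing 1/|cos θ| (or 1/|sin θ|) factor on cumsum comes from.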
my program is:

```
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PI 3.14159265358979f

// 2D texture bound to the input image (linear filtering does the interpolation)
texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;

__global__ void Kernel(float* output, float dt, float dtheta, int sizeImage, int nrays, int nangles)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    float ini, delta, x, y, cumsum, tol, ctheta, stheta, ttheta, theta, t, X, Y;

    if ((i < nangles) && (j < nrays))
    {
        theta = i * dtheta;
        t = -1.0f + j * dt;

        tol = 1.0f / sqrtf(2.0f);
        ini = -tol;
        delta = sqrtf(2.0f) / (sizeImage - 1);

        ctheta = cosf(theta);
        stheta = sinf(theta);
        ttheta = tanf(theta);

        if (stheta < tol) {
            // Integrate along Y; interpolation is in X
            cumsum = 0.0f;
            for (Y = 0; Y < sizeImage; Y++) {
                y = ini + Y * delta;
                x = t / ctheta - y * ttheta;
                X = (x - ini) / delta;
                if (X > -1 && X < sizeImage)
                    cumsum += tex2D(texRef, X + 0.5f, Y + 0.5f);
            }
            output[j * nangles + i] = cumsum / fabsf(ctheta);
        }
        else {
            // Integrate along X; interpolation is in Y
            cumsum = 0.0f;
            for (X = 0; X < sizeImage; X++) {
                x = ini + X * delta;
                y = t / stheta - x / ttheta;
                Y = floorf((y - ini) / delta);
                if (Y > -1 && Y < sizeImage - 1)
                    cumsum += tex2D(texRef, X + 0.5f, Y + 0.5f);
            }
            output[j * nangles + i] = cumsum / fabsf(stheta);
        }
    }
}

void RadonWithTexture(const float* h_input, float* h_output, int sizeImage, int nrays, int nangles)
{
    float* d_output;
    float dt = 2.0f / (nrays - 1);
    float dtheta = PI / (nangles - 1);
    int size = sizeImage * sizeImage * sizeof(float);

    // Allocate CUDA array in device memory
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, sizeImage, sizeImage);

    // Copy the input image from host memory to the CUDA array
    cudaMemcpyToArray(cuArray, 0, 0, h_input, size, cudaMemcpyHostToDevice);

    // Set texture parameters
    texRef.filterMode = cudaFilterModeLinear;
    // texRef.normalized = true;

    // Bind the array to the texture reference
    cudaBindTextureToArray(texRef, cuArray, channelDesc);

    // Allocate GPU buffer for the output image
    cudaMalloc(&d_output, sizeof(float) * nrays * nangles);

    dim3 threadsPerBlock(16, 16);
    dim3 grid((nangles + threadsPerBlock.x - 1) / threadsPerBlock.x,
              (nrays   + threadsPerBlock.y - 1) / threadsPerBlock.y);
    Kernel<<<grid, threadsPerBlock>>>(d_output, dt, dtheta, sizeImage, nrays, nangles);

    // Copy the output from the GPU buffer back to host memory
    cudaMemcpy(h_output, d_output, sizeof(float) * nrays * nangles, cudaMemcpyDeviceToHost);

    cudaFreeArray(cuArray);
    cudaFree(d_output);
}

int main(int argc, char *argv[])
{
    int i, j;

    FILE *fp = fopen(argv[1], "r");
    int sizeImage = atoi(argv[2]);
    int nrays     = atoi(argv[3]);
    int nangles   = atoi(argv[4]);

    float *f     = (float *)malloc(sizeImage * sizeImage * sizeof(float));
    float *radon = (float *)malloc(nrays * nangles * sizeof(float));

    for (i = 0; i < sizeImage * sizeImage; i++)
        fscanf(fp, "%f", &f[i]);

    RadonWithTexture(f, radon, sizeImage, nrays, nangles);

    for (i = 0; i < nrays; i++) {
        for (j = 0; j < nangles; j++)
            fprintf(stdout, "%f ", radon[(nrays-1-i)*nangles + (nangles-1-j)]);
        fprintf(stdout, "\n");
    }

    free(f);
    free(radon);
    fclose(fp);

    return 0;
}
```

I would like to know if there is a better way to accumulate the integral sum, or to parallelize the code (any different memory space I should use, or any math library like Thrust).
Thanks.

my suggestions; simply ignore the ones you feel are irrelevant:

a) avoid expensive math
b) avoid redundant math
c) break up and distribute the kernel, to better distribute the work

reference the cost of the mathematical instructions/functions (the number of clock cycles each takes to complete), for example in the CUDA Programming Guide's instruction-throughput tables

```
tol = 1.0/sqrtf(2);
```

is equally `1 * (1 / sqrtf(2))`, and `1 / sqrtf(2)` is a constant that you can pre-calculate and pre-load;
so `tol` really is `1 * k1`, or simply `k2`

and, for example:

```
x = (t/ctheta - y*ttheta);
output[j*nangles + i] = cumsum/fabsf(ctheta);
```

really can be:

```
d = 1 / ctheta;
x = t*d - y*ttheta;
output[j*nangles + i] = cumsum * fabsf(d);
```

also, my trigonometry is rusty, but tan(x) can be derived from sin(x) and cos(x) (tan = sin/cos), and I equally presume it is cheaper to derive it than to calculate it separately

furthermore, depending on your dimensions (nangles, nrays, sizeImage), you might consider splitting the calculation of ctheta (cosf) and stheta (sinf) out into separate blocks/kernels, so that sinf and cosf are calculated concurrently across multiple SMs, rather than being queued as they are now
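A minimal sketch of that split, assuming a small helper kernel (the names are mine) that fills per-angle lookup tables which the main kernel then reads:

```
// Precompute sin/cos once per angle, instead of once per (angle, ray) thread.
__global__ void TrigKernel(float* d_sin, float* d_cos, float dtheta, int nangles)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nangles) {
        float s, c;
        sincosf(i * dtheta, &s, &c);   // one call yields both values
        d_sin[i] = s;
        d_cos[i] = c;
    }
}
```

The main kernel then reads `stheta = d_sin[i]` and `ctheta = d_cos[i]` (and `ttheta = d_sin[i] / d_cos[i]`), so each trig value is computed nangles times instead of nangles * nrays times.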

and your kernel contains a for loop, which, depending on your dimensions, you might replace with a grid of one block per (i, j) output element, with a block dimension proportional to sizeImage
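A rough sketch of that idea, using the same texture and geometry as the original kernel (only the stheta < tol branch is shown, and the kernel name is mine). Each block owns one output sample; the Y loop is split across the block's threads and the partial sums are reduced in shared memory:

```
__global__ void KernelReduce(float* output, float dt, float dtheta,
                             int sizeImage, int nrays, int nangles)
{
    extern __shared__ float partial[];      // blockDim.x floats

    int i = blockIdx.x;                     // angle index
    int j = blockIdx.y;                     // ray index

    float theta  = i * dtheta;
    float t      = -1.0f + j * dt;
    float tol    = 0.70710678f;             // 1/sqrt(2), precomputed
    float ini    = -tol;
    float delta  = sqrtf(2.0f) / (sizeImage - 1);
    float ctheta = cosf(theta);
    float ttheta = tanf(theta);

    // Each thread accumulates a strided subset of the Y steps
    float sum = 0.0f;
    for (int Y = threadIdx.x; Y < sizeImage; Y += blockDim.x) {
        float y = ini + Y * delta;
        float X = (t / ctheta - y * ttheta - ini) / delta;
        if (X > -1 && X < sizeImage)
            sum += tex2D(texRef, X + 0.5f, Y + 0.5f);
    }
    partial[threadIdx.x] = sum;
    __syncthreads();

    // Standard shared-memory tree reduction (blockDim.x a power of two)
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s)
            partial[threadIdx.x] += partial[threadIdx.x + s];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        output[j * nangles + i] = partial[0] / fabsf(ctheta);
}
```

Launched as, e.g., `KernelReduce<<<dim3(nangles, nrays), 128, 128 * sizeof(float)>>>(...)`, this gives every SM work even when nangles * nrays alone would leave them underoccupied.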

all this in the name of distributing work across SMs as much as possible