Hi all. I am implementing a system response matrix in order to use it in a image reconstruction algorithm.
Briefly, each thread corresponds to a detector bin, and each thread computes the intersection of the ray(that goes from that thread to a beam source) with the x,y and z axis.
This is what I have done so far:
#include <stdlib.h>
#include <stdio.h>
#define blocksize 8
#define iiterations 0
#define jiterations 0
/*The plan will be: split the initial dimensions by 8 and work those 8 parts individually, i.e, I reconstruct the first 88x448 block, free the memory and then go on to the next step, however, I think I need to apply the OSEM after each block*/
__global__ void sysmat(float*intersectionsx,float xfocus,float yfocus, float zfocus, int xbin,int xbinsize,int ybin,int ybinsize, int zbin,int zbinsize,int detectorXDim,int detectorYDim, int projecoes,int detectorZDim, int iiterationsu,int jiterationsu,int angle)
{
//COMPUTE XFOCUS,YFOCUS AND ZFOCUS WITH ANGLE AND DISTANCES TO DETECTOR
int tx=threadIdx.x, ty=threadIdx.y,bx=blockIdx.x, by=blockIdx.y;
float x,y,z,t;
int idy=(ty+by*blocksize)+jiterationsu;
int idx=(tx+bx*blocksize)+iiterationsu;
//Calculo de u, v, w
float slopeVectorx=xfocus-idx*xbinsize+0.5;
float slopeVectory=yfocus-idy*ybinsize+0.5;
float slopeVectorz=zfocus;
__syncthreads();
//calculo dos pontos em que o raio intersecta o eixo xx
int xint=idx+1;
for(xint=xint; xint<detectorXDim/2;xint++){
x=(float)((float)xint*xbinsize);
t=(float)((x-(float)idx)/slopeVectorx);
y=(float)((float)idy+t*slopeVectory);
z=0+t*slopeVectorz;
intersectionsx[(((idx-iiterationsu)+blocksize*(idy-jiterationsu))*(detectorXDim/2)+(xint-1))*3]=x;
intersectionsx[(((idx-iiterationsu)+blocksize*(idy-jiterationsu))*(detectorXDim/2)+(xint-1))*3+1]=y;
intersectionsx[(((idx-iiterationsu)+blocksize*(idy-jiterationsu))*(detectorXDim/2)+(xint-1))*3+2]=z;
//thread 1,1 is going for the same positions as thread 3,0, have to work it out
__syncthreads();
}
}
void printMat(float*P,int uWP,int uHP){
int i,j;
for(i=0;i<uHP;i++){
printf("\n");
for(j=0;j<uWP;j++)
printf("%f ",P[i*uWP+j]);
}
}
__host__ void iteracaoealocagem(){
//so para tirar erros
int xbin=0,ybin=0,zbin=0,xbinsize=1,ybinsize=1,zbinsize=0,angle=0,detectorXDim=2*1408,detectorYDim=2*1792,detectorZDim=60,projecoes=0;
float xfocus=1408,yfocus=600,zfocus=60;
cudaError_t status;
int iiterationsu=iiterations;
int jiterationsu=jiterations;
dim3 dimGrid(11,56);//para um bloco do detector de 88x448
dim3 dimBlock(8,8,1);
//CPU ALLOCATION AND DEBUG
float *intersectionsx_h=(float*)malloc(dimGrid.x*dimGrid.y*dimBlock.x*dimBlock.y*3*1407*sizeof(float));//664Mb
if (intersectionsx_h == NULL)
printf ( "!!!! host memory allocation error (interx)\n");
//GPU ALLOCATION AND DEBUG
float *intersectionsx_d;
status=cudaMalloc((void**)&intersectionsx_d,dimGrid.x*dimGrid.y*dimBlock.x*dimBlock.y*3*1407*sizeof(float));
if (status != cudaSuccess)
printf ("!!!! device memory allocation error (interx)\n");
//KERNEL CALL - TERà QUE ESTAR DENTRO DE UM CICLO, aliás, acho que esta funçao toda tem que tar dentro de um ciclo
sysmat<<<dimGrid,dimBlock>>>(intersectionsx_d,xfocus,yfocus,zfocus, xbin, xbinsize,ybin,ybinsize,zbin,zbinsize,detectorXDim,detectorYDim,projecoes,detectorZDim,iiterationsu,jiterationsu,angle);
//COPY GPU RESULTS TO CPU AND DEBUG
status=cudaMemcpy(intersectionsx_h,intersectionsx_d,dimGrid.x*dimGrid.y*dimBlock.x*dimBlock.y*1407*3*sizeof(float),cudaMemcpyDeviceToHost);
if (status != cudaSuccess)
printf ("!!!! could not retrieve from GPU (interx)\n");
printf("\n Matriz de intersecoes de x \n");
printMat(intersectionsx_h,3,10);
//FREE MEMORY AND DEBUG
status=cudaFree(intersectionsx_d);
if (status != cudaSuccess)
printf ("!!!! device memory free error (interx)\n");
}
void print_func_attr(struct cudaFuncAttributes at)
{
printf("Constant memory in bytes: %lu\n", at.constSizeBytes);
printf("Local memory in bytes: %lu\n", at.localSizeBytes);
printf("Max Thread per Block: %d\n", at.maxThreadsPerBlock);
printf("Number of registers used: %d\n", at.numRegs);
printf("Shared memory in bytes: %lu\n", at.sharedSizeBytes);
}
int main(){
struct cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, sysmat);
print_func_attr(attr);
//penso que terá que estar aqui o ciclo for
//for(int i=1;i<(2*1792)/56;i+=56)
//for(int j=1;j<(1408)/11;j+=11)
//iteracaoealocagem(i, j);
iteracaoealocagem();
}
Basically, I have two questions, one concerning the utility of the GPU, the other concerning the output of my program. As for the former, given that the detector is HUGE(214082*1792), I cannot reconstruct a medical image in one go, so I am working with 8 equal parts of the detector, one at a time. So I will have to move a lot of data a lot of times between GPU and CPU, how do I know if in the end, I won’t have a slower algorithm compared to the CPU?
As for the latter, here is the memory output of my program:
Constant memory in bytes: 4205191
Local memory in bytes: 47418912120928
Max Thread per Block: 214023104
Number of registers used: 51
Shared memory in bytes: 0
So 47 TBytes of local memory…?!214023104 maximum threads per block?! Really, I have no words to describe it…
Any help will be appreciated External Image