Multiple kernels overlapping output

Hi all again. I guess I have described what I am doing several times by now (sorry for that). I am now on the right track, but I have a problem. I have two .cu files: one gives me the intersections of a ray with the xx axis and the other with the yy axis. Since I cannot dynamically allocate memory with CUDA, I do not use all of the space available in the output array, so in the end I have some "nan" values. However, if I run interx.cu and afterwards run intery.cu, where there were "nan"s I now get values from the first file I ran (interx.cu). I am using both free() and cudaFree(), so I am a bit confused about why these values are not cleared from one execution to the next.
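To illustrate what I mean, here is a stripped-down sketch (not my real code, just an illustration of the behaviour): copy back a freshly allocated device buffer without any kernel ever writing to it, and the values are not necessarily zero.

#include <stdio.h>

int main(){
	const int n=16;
	float *d_buf, h_buf[n];
	cudaMalloc((void**)&d_buf,n*sizeof(float));	//no initialisation happens here
	cudaMemcpy(h_buf,d_buf,n*sizeof(float),cudaMemcpyDeviceToHost);
	for(int i=0;i<n;i++)
		printf("%f ",h_buf[i]);	//contents are undefined, not guaranteed to be 0.0
	printf("\n");
	cudaFree(d_buf);	//freeing does not scrub the memory either
	return 0;
}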

I had some more serious problems that I managed to overcome, but I cannot see where the problem is here. Thank you in advance ;)

#include <stdlib.h>

#include <stdio.h>

#define blocksize 8

#define iiterations 0

#define jiterations 0

/*The plan will be: split the initial dimensions by 8 and work those 8 parts individually, i.e, I reconstruct the first 88x112 block, free the memory and then go on to the next step, however, I think I need to apply the OSEM after each block*/

__global__ void sysmat(float*intersectionsx,float xfocus,float yfocus, float zfocus, int xbin,int xbinsize,int ybin,int ybinsize, int zbin,int zbinsize,int detectorXDim,int detectorYDim, int projecoes,int detectorZDim, int iiterationsu,int jiterationsu,int angle)

{

	//COMPUTE XFOCUS,YFOCUS AND ZFOCUS WITH ANGLE AND DISTANCES TO DETECTOR

	int tx=threadIdx.x, ty=threadIdx.y,bx=blockIdx.x, by=blockIdx.y;

	float x,y,z,t;

	int idy=(ty+by*blocksize)+jiterationsu;

	int idx=(tx+bx*blocksize)+iiterationsu;
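	//idx and idy are the global voxel coordinates handled by this thread, shifted by
	//iiterationsu/jiterationsu to select the current 88x112 block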

	//Compute u, v, w

	float slopeVectorx=xfocus-(idx+0.5)*xbinsize;

	float slopeVectory=yfocus-(idy+0.5)*ybinsize;

	float slopeVectorz=zfocus;

	__syncthreads();

	

	//xx axis intersections

	int xint=idx+1;

	for(xint=xint; xint<=detectorXDim/2;xint++){
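		//Intersection with the plane x = xint*xbinsize: solve the parametric ray
		//equation for t, then evaluate y and z at that parameter value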

		x=(float)((float)xint*xbinsize);

		t=(float)((x-(float)(idx+0.5))/slopeVectorx);

		y=(float)((float)(idy+0.5)+t*slopeVectory);

		z=-17+t*slopeVectorz;
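		//Store (x,y,z) contiguously: the flat slot index is
		//((idx-iiterationsu)+88*(idy-jiterationsu))*(detectorXDim/2)+(xint-1),
		//times 3 because each intersection occupies three floats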

	

		intersectionsx[(((idx-iiterationsu)+88*(idy-jiterationsu))*(detectorXDim/2)+(xint-1))*3]=x;

		intersectionsx[(((idx-iiterationsu)+88*(idy-jiterationsu))*(detectorXDim/2)+(xint-1))*3+1]=y;

		intersectionsx[(((idx-iiterationsu)+88*(idy-jiterationsu))*(detectorXDim/2)+(xint-1))*3+2]=z;

		//__syncthreads();

	}

	

}

void printMat(float*P,int uWP,int uHP){

	int i,j,contador=0;

	for(i=0;i<uHP;i++){

		contador++;

		printf("\n");

		for(j=0;j<uWP;j++){

			printf("%f ",P[i*uWP+j]);

	}

	

	}

}

__host__ void iteracaoealocagem(){

	

	//test input

	int xbin=0,ybin=0,zbin=0,xbinsize=1,ybinsize=1,zbinsize=0,angle=0,detectorXDim=2*1408,detectorYDim=2*1792,detectorZDim=60,projecoes=0;

	float xfocus=1408,yfocus=600,zfocus=60;

	cudaError_t status;

	int iiterationsu=iiterations;

	int jiterationsu=jiterations;

	

	//	3584/32=112 => 112/8= 14		1408/16=88 => 88/8=11

	dim3 dimGrid(11,14);//for an 88x112 detector block, this keeps intersectionsx at a maximum of 150Mb and intersectionsy at 400Mb

	dim3 dimBlock(8,8,1);

	//CPU ALLOCATION AND DEBUG

	float *intersectionsx_h=(float*)malloc(dimGrid.x*dimGrid.y*dimBlock.x*dimBlock.y*3*1408*sizeof(float));//166Mb

	if (intersectionsx_h == NULL) 

		printf ( "!!!! host memory allocation error (interx)\n");

	//GPU ALLOCATION AND DEBUG

	float *intersectionsx_d;

	

	status=cudaMalloc((void**)&intersectionsx_d,dimGrid.x*dimGrid.y*dimBlock.x*dimBlock.y*3*1408*sizeof(float));

	if (status != cudaSuccess) 

            printf ("!!!! device memory allocation error (interx)\n");

	//KERNEL CALL 

	sysmat<<<dimGrid,dimBlock>>>(intersectionsx_d,xfocus,yfocus,zfocus, xbin, xbinsize,ybin,ybinsize,zbin,zbinsize,detectorXDim,detectorYDim,projecoes,detectorZDim,iiterationsu,jiterationsu,angle);
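	//Note: the launch itself is not checked; cudaGetLastError() right after the launch
	//would catch configuration errors, and the synchronising cudaMemcpy below surfaces
	//errors from the kernel's execution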

	//COPY GPU RESULTS TO CPU AND DEBUG

	status=cudaMemcpy(intersectionsx_h,intersectionsx_d,dimGrid.x*dimGrid.y*dimBlock.x*dimBlock.y*1408*3*sizeof(float),cudaMemcpyDeviceToHost);

	if (status != cudaSuccess) 

            printf ("!!!! could not retrieve from GPU (interx)\n");

	printf("\n Matriz de intersecoes de x \n");

	printMat(intersectionsx_h,3,1046500);

	//FREE MEMORY AND DEBUG

	status=cudaFree(intersectionsx_d);

	if (status != cudaSuccess) 

		printf ("!!!! device memory free error (interx)\n");

	free(intersectionsx_h);

}

void print_func_attr(struct cudaFuncAttributes at)

{

        printf("Constant memory in bytes: %lu\n", at.constSizeBytes);

        printf("Local memory in bytes: %lu\n", at.localSizeBytes);

        printf("Max Thread per Block: %d\n", at.maxThreadsPerBlock);

        printf("Number of registers used: %d\n", at.numRegs);

        printf("Shared memory in bytes: %lu\n", at.sharedSizeBytes);

}

int main(){

	struct cudaFuncAttributes attr;

	cudaFuncGetAttributes(&attr, sysmat);	//query the kernel's resource usage

	print_func_attr(attr);

	iteracaoealocagem();

}

P.S. This is the code for interx.cu; the other file is almost the same, the only thing that changes is the for loop inside the kernel.

I guess I figured it out: if I pad the matrix with zeros, I believe that deals with the problem.
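Concretely, what I mean by padding with zeros is clearing the buffers right after allocating them, along these lines (a minimal sketch reusing the variables from the code above; the host memset needs <string.h>). As far as I understand, cudaMalloc does not initialise the memory it returns and cudaFree does not scrub it, so a later allocation can be handed the same region with the old contents still in place, which would explain interx values showing up in intery's output.

	//Zero the device buffer right after the cudaMalloc, so slots the kernel never
	//writes read back as 0.0f instead of whatever was left in that memory
	//(an all-zero bit pattern is 0.0f for IEEE floats)
	status=cudaMemset(intersectionsx_d,0,dimGrid.x*dimGrid.y*dimBlock.x*dimBlock.y*3*1408*sizeof(float));
	if (status != cudaSuccess)
		printf ("!!!! device memory clear error (interx)\n");
	//Same idea for the host copy (plain memset from <string.h>)
	memset(intersectionsx_h,0,dimGrid.x*dimGrid.y*dimBlock.x*dimBlock.y*3*1408*sizeof(float));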