How to pass the answers after applying tex3d ?

I am trying to use texture reference API by using tex3d. But I am unable to get its value after all calculations. Last, of the program, I am printing all values, but there no values are printing. Can you help me to get all values after kernel calculations?

#include<stdio.h>
#include<cuda.h>

texture<float,cudaTextureType3D,cudaReadModeElementType> texreference;

__global__ void kernel(float* dmatrix, int size)
{
	int loop;
	int xindex;
	int yindex;
	int zindex;
	
	// calculate each thread global index
	xindex=blockIdx.x*blockDim.x+threadIdx.x;
	yindex=blockIdx.y*blockDim.y+threadIdx.y;
	
	for (loop=0;loop<size;loop++)
	{
		zindex=loop;
	
		// fetch cuda array through texture reference
		dmatrix[zindex*size*size + yindex*size+xindex]=
			tex3D(texreference,xindex,yindex,zindex);
			printf("output = %f\n",dmatrix[zindex*size*size + yindex*size+xindex]);
	}
	return;
}

int main(int argc, char** argv)
{
	int size=8;

	dim3 blocknum;
	dim3 blocksize;
	
	float* hmatrix;
	float* dmatrix;
	
	cudaArray* cudaarray;
	cudaExtent volumesize;
	cudaChannelFormatDesc channel;
	
	cudaMemcpy3DParms copyparms={0};
	
	// allocate host and device memory
	hmatrix=(float*)malloc(sizeof(float)*size*size*size);
	cudaMalloc((void**)&dmatrix,sizeof(float)*size*size*size);
	
	// initialize host array before usage
	for(int loop=0; loop<size*size*size;loop++)
	{
		hmatrix[loop]= loop ;
		printf("%f\n", hmatrix[loop] );
	}

	// set cuda array volume size
	volumesize=make_cudaExtent(size,size,size);	
		
	// create channel to describe data type
	channel=cudaCreateChannelDesc<float>();	

	// allocate device memory for cuda array
	cudaMalloc3DArray(&cudaarray,&channel,volumesize);

	// set cuda array copy parameters
	copyparms.extent=volumesize;
	copyparms.dstArray=cudaarray;
	copyparms.kind=cudaMemcpyHostToDevice;
	
	copyparms.srcPtr= make_cudaPitchedPtr((void*)hmatrix,sizeof(float)*size,size,size);
	
	cudaMemcpy3D(&copyparms);

	// set texture filter mode property
	// use cudaFilterModePoint or cudaFilterModeLinear
	texreference.filterMode=cudaFilterModePoint;

	// set texture address mode property
	// use cudaAddressModeClamp or cudaAddressModeWrap
	texreference.addressMode[0]=cudaAddressModeWrap;
	texreference.addressMode[1]=cudaAddressModeWrap;
	texreference.addressMode[2]=cudaAddressModeClamp;

	// bind texture reference with cuda array
	cudaBindTextureToArray(texreference,cudaarray,channel);

	
	blocksize.x=8;
	blocksize.y=8;
	blocksize.z=8;
	
	blocknum.x=(int)ceil((float)size/8);
	blocknum.y=(int)ceil((float)size/8);
	blocknum.z=(int)ceil((float)size/8);
	
		
	// execute device kernel
	kernel<<<blocknum,blocksize>>>(dmatrix,size);
	
	// unbind texture reference to free resource
	cudaUnbindTexture(texreference);
	
	// copy result array from device to host memory
	const int bytes = sizeof(float)*size*size*size;
	cudaMemcpy(hmatrix,dmatrix,bytes,cudaMemcpyDeviceToHost);
	
	// free host and device memory

	cudaFree(dmatrix);
	cudaFreeArray(cudaarray);
	
	for(int loop=0; loop<size*size*size;loop++)
		printf("%f\n", hmatrix[loop] );
   
	free(hmatrix);
	
  	
	return 0;
}

This is my code which I am using.
I am printing values from code line numbers 113. this all is same as previous as input.

add proper CUDA error checking to your code. (google that, if need be)
run your code with cuda-memcheck

Do these things before asking others for help.