Odd behavior when reading from texture Texture and Surface memory am I missing something

Hello all,

I want to write a simple program that solves the stationary laplace equation using texture memory to do the interpolation on the interior grid points for me, I’m rather lazy. The program I have written has several problems which are probably trivial and stem from my lack of understanding of CUDA and an unfamiliarly with pointers in C/C++. I also am trying to use surface references to store the results of my texture reads. My attempt is below along with what I think is rather strange output

//surf_tex_laplace.cu

/*

A simple Laplace solver that uses the bilinear interpolation feature of texture memory

to average grid elements simply by reading

*/

#include <stdio.h>

#include <cuda.h>

#include <stdlib.h>

#include <time.h>

#define nx 3

#define ny 3

//Boring stuff

time_t prog_start = time (NULL);

int i, j, k;

//Create two 2D texture references to our domain (reading)

texture<float, 2>  texOne;

//Create two 2D surfaces reference to our domain (writing)

surface<void, 2> surfOne;

//Get CUDA errors, taken from drdobbs.com

void checkCUDAError(const char *msg) {

  cudaError_t err = cudaGetLastError();

  if( cudaSuccess != err) {

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); 

    exit(EXIT_FAILURE); 

  }

} 

//Return total error in arrays

float error_est(float currentArray[][ny+2], float previousArray[][ny+2]){

	float err = 0.0;

		for (i = 1; i <=nx; i++ ){

			for (j = 1; j <= ny;  j++){

				err += currentArray[i][j] - previousArray[i][j];

				previousArray[i][j] = currentArray[i][j];

			}

		}

	if (err > 0){

		return err;

	}

	else if (err < 0){

		return -err;

	}	

	else

		return err;

}	 

//Populate domain with initial values

void InitializeDomain(float theArray[][ny+2], int SizeX, int SizeY, float left, float right, float top, float bottom) {

	for (i = 0; i < SizeX; i++)

   	for (j = 0; j < SizeY; j++)

   		if (i == 0 && j > 0 && j != (SizeY-1) )

   			theArray[i][j] = top;

   		else if (j == 0 && i > 0 && i != (SizeX-1) )

   			theArray[i][j] = left;

   		else if (i == (SizeX-1) && j > 0 && j < (SizeY-1))

   			theArray[i][j] = bottom;

   		else if (j == (SizeY-1) && i > 0 && i < (SizeX-1))

   			theArray[i][j] = right;

   		else

   			//theArray[i][j] = abs(0.25*(top+bottom+left+right)*sin(float(i))*sin(float(j)));

   			theArray[i][j] = 0.25*(top+bottom+left+right);

} 

void DebugDomain(float currentArray[][ny+2]) {

	for (i = 0; i <= nx+1; i++){

   	for (j = 0; j <= ny+1; j++){

			printf("%f \t", currentArray[i][j]);

		}

		printf(" \n");

	}

} 

//Our very complicated kernel

__global__ void averagatron( ) 

{

	// calculate surface coordinates

	unsigned int x = (blockIdx.x*blockDim.x + threadIdx.x) + 1;

	unsigned int y = (blockIdx.y*blockDim.y + threadIdx.y) + 1;

	float u = ( x / (float)(nx+2) );

	float v = ( y / (float)(ny+2) );

	// read from texture memory and write to a CUDA array (via surface reference)

	if (x <= nx && y <= ny){

			float data = tex2D(texOne, u, v);

			surf2Dwrite(data, surfOne, x*sizeof(float), y, cudaBoundaryModeClamp);

	}

}

// Host code

int main(int argc, char** argv)

{

//Boring Stuff

float ERROR = 10000.00*(float)nx*(float)ny;

float TOL = 0.0001*(float)nx*(float)ny;

//Create our domain and populate the initial values

float domainOne[nx+2][ny+2];

float domainTwo[nx+2][ny+2];

InitializeDomain(domainOne, nx+2, ny+2, 0.0, 100.0, 0.0, 100.0);

InitializeDomain(domainTwo, nx+2, ny+2, 0.0, 100.0, 0.0, 100.0);

DebugDomain(domainOne);

//Start GPU Timer

//Taken from NVIDIA programming guide

cudaEvent_t start, stop; 

float gpu_time; 

cudaEventCreate(&start); 

cudaEventCreate(&stop); 

cudaEventRecord( start, 0 );

checkCUDAError("Error Initializing Timer");

//Create the device's copy of our domain

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

cudaArray* cuDomainOne;

cudaMallocArray(&cuDomainOne, &channelDesc, nx+2, ny+2, cudaArraySurfaceLoadStore);

//Define Texture Parameters

texOne.addressMode[0] = cudaAddressModeClamp;

texOne.addressMode[1] = cudaAddressModeClamp;

texOne.filterMode = cudaFilterModeLinear;

texOne.normalized = true;

//Bind the surface references to our host domain

cudaBindSurfaceToArray(surfOne, cuDomainOne);

checkCUDAError("Failure Creating Surface Reference");

//Bind the texture references to our host domain

cudaBindTextureToArray(texOne, cuDomainOne, channelDesc);

checkCUDAError("Failure Creating Texture Reference");

//Copy host domain to device domain 

cudaMemcpy2DToArray( cuDomainOne, 0, 0, domainOne, (nx+2)*sizeof(float), (nx+2)*sizeof(float), ny+2, cudaMemcpyHostToDevice);

checkCUDAError("Failure to Initialize Device Domain");

// Invoke kernel

int NUM_THREADS = nx*ny;

int THREADS_PER_BLOCK = 64;	

int nBlocks = NUM_THREADS/THREADS_PER_BLOCK + ((NUM_THREADS % THREADS_PER_BLOCK)?1:0);

dim3 dimBlock(8, 8);

for (k = 0; k < 100000 && ERROR > TOL; k++){

	

	//Execute kernel

	averagatron<<<nBlocks, dimBlock>>>( );

	checkCUDAError("Failure to Execute Kernel");

	

	cudaMemcpy2DFromArray( domainOne, (nx+2)*sizeof(float), cuDomainOne, 0, 0, (nx+2)*sizeof(float), ny+2, cudaMemcpyDeviceToHost);

	checkCUDAError("Failure to Return Data from Device to Domain One");

	

	ERROR = error_est(domainOne, domainTwo);

	

	

}

cudaThreadSynchronize();

cudaEventRecord( stop, 0 );

cudaEventElapsedTime( &gpu_time, start, stop );

checkCUDAError("Error Stopping Timer");

time_t prog_end = time (NULL);

double cpu_time = difftime(prog_end, prog_start);

printf("GPU time: %f (ms)\n", time);

printf("CPU time: %f (s)\n", cpu_time);

//printf("\n");

DebugDomain(domainOne);

FILE *file;

file = fopen("solution.txt","w");

for (i=0; i < nx+2; i++){

			for (j=0; j < ny+2; j++){

				fprintf(file,"%f \t",domainTwo[i][j]);	

			}

				fprintf(file, " \n");		

		}

fclose(file);

//Clean up

cudaEventDestroy( start ); 

cudaEventDestroy( stop );

checkCUDAError("Failure to Destroy Timer");

cudaUnbindTexture(texOne);

checkCUDAError("Failure to Unbind Texture");

cudaFreeArray(cuDomainOne);

checkCUDAError("Failure to Free Device Memory");

//free(domainOne);

//free(domainTwo);

return 0;

}

Here’s the output

50.000000       0.000000        0.000000        0.000000        50.000000        

0.000000        50.000000       50.000000       50.000000       100.000000       

0.000000        50.000000       50.000000       50.000000       100.000000       

0.000000        50.000000       50.000000       50.000000       100.000000       

50.000000       100.000000      100.000000      100.000000      50.000000        

GPU time: 0.000000 (ms)

CPU time: 0.000000 (s)

50.000000       0.000000        0.000000        0.000000        50.000000        

0.000000        16.666668       5.555556        1.851853        100.000000       

0.000000        5.555556        9.259262        5.555575        100.000000       

0.000000        1.851853        5.555575        6.790281        100.000000       

50.000000       100.000000      100.000000      100.000000      50.000000

The first array is the initial conditions, I set the interior points to the average of the boundaries, the second array is “solution” which I simply don’t believe is right. Also my timing information is completely wrong. Any help with either of these two issues would be greatly appreciated.

Change the order of synchronization:
cudaEventRecord( stop, 0 );
cudaThreadSynchronize();

This will help you get the right timing. EventRecord needs synchronization before you can get the correct result.

as for the result… eh… it’ll take some time for me to follow your code. Maybe you could trim it a bit first?

Thanks for replying hywneuron,

calling [font=“Courier New”]cudaThreadSynchronize();[/font] [font=“Courier New”]before cudaEventRecord( stop, 0 );[/font] seems to have no effect on the output of my program. Below is a trimmer version of my program

//Create two 2D texture references to our domain (reading)

texture<float, 2>  texOne;

//Create two 2D surfaces reference to our domain (writing)

surface<void, 2> surfOne;

//Populate domain with initial values

void InitializeDomain(float theArray[][ny+2], int SizeX, int SizeY, float left, float right, float top, float bottom) {

	for (i = 0; i < SizeX; i++)

   	for (j = 0; j < SizeY; j++)

   		if (i == 0 && j > 0 && j != (SizeY-1) )

   			theArray[i][j] = top;

   		else if (j == 0 && i > 0 && i != (SizeX-1) )

   			theArray[i][j] = left;

   		else if (i == (SizeX-1) && j > 0 && j < (SizeY-1))

   			theArray[i][j] = bottom;

   		else if (j == (SizeY-1) && i > 0 && i < (SizeX-1))

   			theArray[i][j] = right;

   		else

   			//theArray[i][j] = abs(0.25*(top+bottom+left+right)*sin(float(i))*sin(float(j)));

   			theArray[i][j] = 0.25*(top+bottom+left+right);

} 

//Our very complicated kernel

__global__ void averagatron( ) 

{

	// calculate surface coordinates

	unsigned int x = (blockIdx.x*blockDim.x + threadIdx.x) + 1;

	unsigned int y = (blockIdx.y*blockDim.y + threadIdx.y) + 1;

	float u = ( x / (float)(nx+2) );

	float v = ( y / (float)(ny+2) );

	// read from texture memory and write to a CUDA array (via surface reference)

	if (x <= nx && y <= ny){

			float data = tex2D(texOne, u, v);

			surf2Dwrite(data, surfOne, x*sizeof(float), y, cudaBoundaryModeClamp);

	}

}

// Host code

int main(int argc, char** argv)

{

//Boring Stuff

float ERROR = 10000.00*(float)nx*(float)ny;

float TOL = 0.0001*(float)nx*(float)ny;

//Create our domain and populate the initial values

float domainOne[nx+2][ny+2];

float domainTwo[nx+2][ny+2];

InitializeDomain(domainOne, nx+2, ny+2, 0.0, 100.0, 0.0, 100.0);

InitializeDomain(domainTwo, nx+2, ny+2, 0.0, 100.0, 0.0, 100.0);

//Start GPU Timer

//Taken from NVIDIA programming guide

cudaEvent_t start, stop; 

float gpu_time; 

cudaEventCreate(&start); 

cudaEventCreate(&stop); 

cudaEventRecord( start, 0 );

checkCUDAError("Error Initializing Timer");

//Create the device's copy of our domain

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

cudaArray* cuDomainOne;

cudaMallocArray(&cuDomainOne, &channelDesc, nx+2, ny+2, cudaArraySurfaceLoadStore);

//Define Texture Parameters

texOne.addressMode[0] = cudaAddressModeClamp;

texOne.addressMode[1] = cudaAddressModeClamp;

texOne.filterMode = cudaFilterModeLinear;

texOne.normalized = true;

//Bind the surface references to our host domain

cudaBindSurfaceToArray(surfOne, cuDomainOne);

checkCUDAError("Failure Creating Surface Reference");

//Bind the texture references to our host domain

cudaBindTextureToArray(texOne, cuDomainOne, channelDesc);

checkCUDAError("Failure Creating Texture Reference");

//Copy host domain to device domain 

cudaMemcpy2DToArray( cuDomainOne, 0, 0, domainOne, (nx+2)*sizeof(float), (nx+2)*sizeof(float), ny+2, cudaMemcpyHostToDevice);

checkCUDAError("Failure to Initialize Device Domain");

// Invoke kernel

int NUM_THREADS = nx*ny;

int THREADS_PER_BLOCK = 64;	

int nBlocks = NUM_THREADS/THREADS_PER_BLOCK + ((NUM_THREADS % THREADS_PER_BLOCK)?1:0);

dim3 dimBlock(8, 8);

for (k = 0; k < 100000 && ERROR > TOL; k++){

	

	//Execute kernel

	averagatron<<<nBlocks, dimBlock>>>( );

	checkCUDAError("Failure to Execute Kernel");

	

	cudaMemcpy2DFromArray( domainOne, (nx+2)*sizeof(float), cuDomainOne, 0, 0, (nx+2)*sizeof(float), ny+2, cudaMemcpyDeviceToHost);

	checkCUDAError("Failure to Return Data from Device to Domain One");

	

	ERROR = error_est(domainOne, domainTwo);

	

	

}

cudaThreadSynchronize();

cudaEventRecord( stop, 0 );

cudaEventElapsedTime( &gpu_time, start, stop );

checkCUDAError("Error Stopping Timer");

time_t prog_end = time (NULL);

double cpu_time = difftime(prog_end, prog_start);

printf("GPU time: %f (ms)\n", time);

printf("CPU time: %f (s)\n", cpu_time);

//printf("\n");

DebugDomain(domainOne);

FILE *file;

file = fopen("solution.txt","w");

for (i=0; i < nx+2; i++){

			for (j=0; j < ny+2; j++){

				fprintf(file,"%f \t",domainTwo[i][j]);	

			}

				fprintf(file, " \n");		

		}

fclose(file);

//Clean up

cudaEventDestroy( start ); 

cudaEventDestroy( stop );

checkCUDAError("Failure to Destroy Timer");

cudaUnbindTexture(texOne);

checkCUDAError("Failure to Unbind Texture");

cudaFreeArray(cuDomainOne);

checkCUDAError("Failure to Free Device Memory");

//free(domainOne);

//free(domainTwo);

return 0;

}

I guess the most important part is the addressing issue, are CUDA C arrays addressed in row major or column major order?

Caleb

To get timing correctly, you need to add calls to cudaEventSynchronize() after each cudaEventRecord(). Also notice that you’re not passing the correct argument to printf when printing GPU time and you’re not likely to see anything other than 0 in the second printf (because the function you use to measure CPU time, time(), has granularity of 1 second.)

Now, what exactly is happening in your kernel? Consider the stable state or the “solution”. Take the position x=1,y=1. Look at the programming guide, section F.2.

The texture fetcher calculates x_B = 0.5, y_B = 0.5. Then it fetches T[0,0]=50, T[0,1]=0, T[1,0]=0, T[1,1]=16.66, averages, arrives at 16.66 and stores it in T[1,1].

I don’t think you can do what you want with textures. You have to do the averaging manually.

If you redefine the texture as “filterMode = cudaFilterModePoint, normalized = false” and the kernel as follows

__global__ void averagatron( ) 

{

        // calculate surface coordinates

        unsigned int x = (blockIdx.x*blockDim.x + threadIdx.x) + 1;

        unsigned int y = (blockIdx.y*blockDim.y + threadIdx.y) + 1;

// read from texture memory and write to a CUDA array (via surface reference)

        if (x <= nx && y <= ny){

                        float data = (tex2D(texOne, x, y-1) + tex2D(texOne, x, y+1) + tex2D(texOne, x-1, y) + tex2D(texOne, x+1, y))/4;

                        surf2Dwrite(data, surfOne, x*sizeof(float), y, cudaBoundaryModeClamp);

        }

}

the solution will look a lot more plausible.

@hamster143,

Thank you for the help with the timing information; I did need another [font=“Courier New”]cudaThreadSynchronize(); [/font] after starting my timer, and thanks for catching my typo on the [font=“Courier New”]printf[/font]. When fetching an array element from Texture Memory, the GPU itself can perform low level, linear, interpolation using the elements nearest neighbors which has apperently been branded “texture filtering”, see this article by Rob Farber. I am also fairly confident that every element in my CUDA array is being read since the interior elements have all been modified by the end of the program. I am using normalized coordinates to address the elements of my array which may have appeared to only sample the top left corner of the array. I still think that my problem lies in a subtle issue of addressing, so I’ll keep looking down that avenue.

Caleb

It can perform interpolation between 4 elements adjacent to each other. But that’s not what you need to solve your problem.

@hamster143,

I do need to perform averaging over the four nearest neighbors, you can see a more detailed explanation of my problem here. In order to solve the Laplace equation on a 2D rectangular domain you essentially break up the domain into little squares the solution at time t at point (x,y) can be numerically obtained by averaging the solution at t-1 at points (x-1,y), (x+1,y), (x, y-1), (x, y+1) which is essentially texture filtering.

Caleb

No, texture filtering performs averaging of points (x-1,y-1), (x-1,y), (x,y-1), (x,y).

I think that should still work but you end up with the solution for the point (x - 0.5, y - 0.5). One way of thinking about it is to imagine another grid with spacing that is smaller by a factor of root 2 and that is rotated by 45 degrees.

@hamster123 You are entirely correct that the way texture filtering is performed my idea will not work as intended.

@shawkie I was thinking along similar lines and I have modified my initial attempt, now I first embed the points I want to sample within a larger grid so imagine that the initial grid has numbers placed into cells on graph paper, I sample from the points at which the cells intersect so that the filtering averages over the correct cells. I accomplish this by calling tex2D with simple integer arguments. I then store these interpolated points in a temporary CUDA array and in another kernel I perform the same interpolation on the points in the temporary array and place these points in the interior of the initial grid and this constitutes one time step in the simulation. This should work on paper, however, I am still getting odd results. I have an initial grid with the following values

0.000000        0.000000        0.000000        0.000000        50.000000        

0.000000        25.000000       25.000000       25.000000       100.000000       

0.000000        25.000000       25.000000       25.000000       100.000000       

0.000000        25.000000       25.000000       25.000000       100.000000       

50.000000       100.000000      100.000000      100.000000      100.000000

What should be stored in the temporary array is

6.250000        12.500000       12.500000       43.750000             

12.500000       25.000000       25.000000       37.500000           

12.500000       25.000000       25.000000       37.500000            

43.750000       62.500000       62.500000       81.250000

but instead I get

6.250000        12.500000       12.500000       43.750000        

0.000000        50.000000       50.000000       50.000000        

50.000000       100.000000      100.000000      100.000000       

0.000000        0.000000        0.000000        0.000000

The program that I am using for debugging is reproduced below, I hard code the values in the arrays the first time through so there’s no ambiguity

//surf_tex_laplace.cu

/*

A simple Laplace solver that uses the bilinear interpolation feature of texture memory

to average grid elements simply by reading

*/

#include <stdio.h>

#include <cuda.h>

#include <stdlib.h>

#define nx 3

#define ny 3

#define NUM_THREADS 256

//Boring stuff

int i, j, k;

//Create two 2D texture references to our domain (reading)

texture<float, 2>  texGrid;

texture<float, 2>  texTemp;

//Create two 2D surfaces reference to our domain (writing)

surface<void, 2> surfGrid;

surface<void, 2> surfTemp;

void checkCUDAError(const char *msg) {

  cudaError_t err = cudaGetLastError();

  if( cudaSuccess != err) {

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); 

    exit(EXIT_FAILURE); 

  }

} 

void DebugDomain(float currentArray[][ny+2]) {

	for (i = 0; i < nx+2; i++){

   	for (j = 0; j < ny+2; j++){

			printf("%f \t", currentArray[i][j]);

		}

		printf(" \n");

	}

		printf(" \n");

} 

void DebugTempArray(float currentArray[][ny+1]) {

	for (i = 0; i < nx+1; i++){

   	for (j = 0; j < ny+1; j++){

			printf("%f \t", currentArray[i][j]);

		}

		printf(" \n");

	}

		printf(" \n");

} 

//Our very complicated kernels

__global__ void read_domain() 

{

	// calculate surface coordinates

	unsigned int x = (blockIdx.x*blockDim.x + threadIdx.x) + 1;

	unsigned int y = (blockIdx.x*blockDim.x + threadIdx.y) + 1;

	float u = ( float(x) );

	float v = ( float(y) );

			float data = tex2D(texGrid, u, v);

			surf2Dwrite(data, surfTemp, (x-1)*sizeof(float), (y-1), cudaBoundaryModeTrap);

}

__global__ void update_domain() 

{

	// calculate surface coordinates

	unsigned int x = (blockIdx.x*blockDim.x + threadIdx.x) + 1;

	unsigned int y = (blockIdx.x*blockDim.x + threadIdx.y) + 1;

	float u = ( float(x) );

	float v = ( float(y) );

			float data = tex2D(texTemp, u, v);

			surf2Dwrite(data, surfGrid, x*sizeof(float), y, cudaBoundaryModeTrap);

}

// Host code

int main(int argc, char** argv)

{

float Domain[nx+2][ny+2]; 

float TempGrid[nx+1][ny+1]; 

//Hard code array values for testing 

Domain[0][0] = 0;

Domain[0][1] = 0;

Domain[0][2] = 0;

Domain[0][3] = 0;

Domain[0][4] = 50;

Domain[1][0] = 0;

Domain[1][1] = 25;

Domain[1][2] = 25;

Domain[1][3] = 25;

Domain[1][4] = 100;

Domain[2][0] = 0;

Domain[2][1] = 25;

Domain[2][2] = 25;

Domain[2][3] = 25;

Domain[2][4] = 100;

Domain[3][0] = 0;

Domain[3][1] = 25;

Domain[3][2] = 25;

Domain[3][3] = 25;

Domain[3][4] = 100;

Domain[4][0] = 50;

Domain[4][1] = 100;

Domain[4][2] = 100;

Domain[4][3] = 100;

Domain[4][4] = 100;

TempGrid[0][0] = -1 ;

TempGrid[0][1] = -1 ;

TempGrid[0][2] = -1 ;

TempGrid[0][3] = -1 ;

TempGrid[1][0] = -1 ;

TempGrid[1][1] = -1 ;

TempGrid[1][2] = -1 ;

TempGrid[1][3] = -1 ;

TempGrid[2][0] = -1 ;

TempGrid[2][1] = -1 ;

TempGrid[2][2] = -1 ;

TempGrid[2][3] = -1 ;

TempGrid[3][0] = -1 ;

TempGrid[3][1] = -1 ;

TempGrid[3][2] = -1 ;

TempGrid[3][3] = -1 ;

DebugDomain(Domain);

//Create the device's copy of our domain

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

cudaArray* cuDomain;

cudaMallocArray(&cuDomain, &channelDesc, nx+2, ny+2, cudaArraySurfaceLoadStore);

cudaArray* cuTempGrid;

cudaMallocArray(&cuTempGrid, &channelDesc, nx+1, ny+1, cudaArraySurfaceLoadStore);

checkCUDAError("Failure Allocating Device Memory");

//Define Texture Parameters

texGrid.addressMode[0] = cudaAddressModeClamp;

texGrid.addressMode[1] = cudaAddressModeClamp;

texGrid.filterMode = cudaFilterModeLinear;

//texGrid.normalized = false;

texTemp.addressMode[0] = cudaAddressModeClamp;

texTemp.addressMode[1] = cudaAddressModeClamp;

texTemp.filterMode = cudaFilterModeLinear;

//texTemp.normalized = false;

checkCUDAError("Failure Setting Texture Properties");

//Bind the surface references to our host domain

cudaBindSurfaceToArray(surfGrid, cuDomain);

cudaBindSurfaceToArray(surfTemp, cuTempGrid);

checkCUDAError("Failure Creating Surface Reference");

//Bind the texture references to our host domain

cudaBindTextureToArray(texGrid, cuDomain, channelDesc);

cudaBindTextureToArray(texTemp, cuTempGrid, channelDesc);

checkCUDAError("Failure Creating Texture Reference");

//Copy host domain to device domain 

cudaMemcpy2DToArray( cuDomain, 0, 0, Domain, (nx+2)*sizeof(float), (ny+2)*sizeof(float), ny+2, cudaMemcpyHostToDevice);

checkCUDAError("Failure to Initialize Device Domain");

	

//Execute kernel

	

read_domain<<<1, 4, 4>>>();

cudaMemcpy2DFromArray( TempGrid, (nx+1)*sizeof(float), cuTempGrid, 0, 0, (ny+1)*sizeof(float), (ny+1), cudaMemcpyDeviceToHost);

checkCUDAError("Failure to Return Temporary Array");

DebugTempArray(TempGrid);

/*

	update_domain<<<1, 3, 3>>>();

	cudaThreadSynchronize();

	checkCUDAError("Failure to advance time");

*/

//Copy data from device to host

	

cudaMemcpy2DFromArray( Domain, (nx+2)*sizeof(float), cuDomain, 0, 0, (ny+2)*sizeof(float), (ny+2), cudaMemcpyDeviceToHost);

checkCUDAError("Failure to Return Data from Device to Domain One");

return 0;

}

All help is greatly appreciated.

Caleb