I think I oversimplified my problem in the explanation. I have copied data from the host to the device and allocated memory on the device for my results. As a test, I just wanted to copy the input data that is already on the device into the memory allocated for the result on the device, and then copy that result back to the host. In the following kernel the data being moved is Gamma, with size = 400*sizeof(float).

I send Gamma = 0.000000, -0.124347, 0.132042

and get back Cp = 0.000000, -0.140711, 0.640000

```
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
#include "matrixMul.h"
////////////////////////////////////////////////////////////////////////////////
//! Panel influence-sum kernel (work in progress) with a debug pass-through.
//!
//! NOTE: despite its name, this kernel performs no matrix multiplication.
//!
//! What it does today:
//!   1. For every field point i in [0, 2911) it loops over 400 panels and
//!      accumulates two sums, smt and smn, of tangential influence
//!      coefficients weighted by Gamma[j]. The store of the resulting
//!      pressure coefficient into Cp is still disabled (see the commented
//!      formula below), so these sums are currently discarded.
//!   2. Debug pass-through: Cp[k] = Gamma[k] for k in [0, 400).
//!
//! Preconditions (not checked here):
//!   - Foil_X, Foil_Y, Panel_Length, Theta, Gamma hold at least 400 floats.
//!   - Field_X, Field_Y hold at least 2911 floats.
//!   - Cp holds at least 400 floats.
//!   - All pointers are device pointers.
//!
//! Launch layout: field points are distributed over threadIdx.x with a stride
//! of blockDim.x; every block (and every threadIdx.y row) repeats that work.
//! Only threads with threadIdx.y == 0 in blocks with blockIdx.x == 0 write Cp.
//! No shared memory is used.
////////////////////////////////////////////////////////////////////////////////
__global__ void
matrixMul( float* Cp, float* Foil_X, float* Foil_Y, float* Panel_Length, float* Theta, float* Gamma, float* Field_X, float* Field_Y)
{
    const int   p       = 400;    // panel entries read from Foil_*/Panel_Length/Theta/Gamma
    const int   fp      = 2911;   // field points read from Field_X/Field_Y
    const float PI      = 3.14159265358979323846f;
    const float HALF_PI = PI / 2.0f;

    const int bx = blockIdx.x;
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    float A, B, C, D, E, F, G, P;
    float ct1_x, ct2_x, ct1_y, ct2_y;
    float at_x, at_y;
    float smt = 0.0f, smn = 0.0f;

    // Stride by the actual launch width so no field point is skipped when
    // blockDim.x differs from the compile-time block size.
    for (int i = tx; i < fp; i += blockDim.x)
    {
        const float fx = Field_X[i];
        const float fy = Field_Y[i];

        smt = 0.0f;
        smn = 0.0f;

        // ct2 values of the previous panel; only read when j > 0.
        float ct2_x_prev = 0.0f;
        float ct2_y_prev = 0.0f;

        for (int j = 0; j < p; ++j)
        {
            const float theta = Theta[j];
            const float len   = Panel_Length[j];
            const float dx    = fx - Foil_X[j];
            const float dy    = fy - Foil_Y[j];   // was Field_Y[1] in the A term (index typo)

            // Terms shared by both coefficient sets.
            A = -dx*cosf(theta) - dy*sinf(theta);
            B = dx*dx + dy*dy;
            E = dx*sinf(theta) - dy*cosf(theta);
            F = logf(1.0f + (len*len + 2.0f*A*len)/B);
            G = atan2f(E*len, B + A*len);

            // "_x" coefficients: evaluated with reference angle 0.
            C = sinf(0.0f - theta);
            D = cosf(0.0f - theta);
            P = dx*sinf(0.0f - 2.0f*theta) + dy*cosf(0.0f - 2.0f*theta);
            ct2_x = C + 0.5f*P*F/len + (A*D - C*E)*G/len;
            ct1_x = 0.5f*C*F - D*G - ct2_x;

            // "_y" coefficients: evaluated with reference angle PI/2.
            C = sinf(HALF_PI - theta);
            D = cosf(HALF_PI - theta);
            P = dx*sinf(HALF_PI - 2.0f*theta) + dy*cosf(HALF_PI - 2.0f*theta);
            ct2_y = C + 0.5f*P*F/len + (A*D - C*E)*G/len;
            ct1_y = 0.5f*C*F - D*G - ct2_y;       // was "- ct2_x[0]" (copy-paste slip)

            // The first panel contributes nothing; later panels combine their
            // own ct1 with the previous panel's ct2.
            if (j == 0)
            {
                at_x = 0.0f;
                at_y = 0.0f;
            }
            else
            {
                at_x = ct1_x + ct2_x_prev;
                at_y = ct1_y + ct2_y_prev;
            }

            smt += at_x*Gamma[j];
            smn += at_y*Gamma[j];

            ct2_x_prev = ct2_x;
            ct2_y_prev = ct2_y;
        }

        // Disabled pending the pass-through test below. Re-enabling this
        // requires Cp to hold at least fp floats.
        // const float AoA = 8.0f*PI/180.0f;
        // Cp[i] = 1.0f - ((cosf(0.0f - AoA) + smt)*(cosf(0.0f - AoA) + smt)
        //               + (cosf(HALF_PI - AoA) + smn)*(cosf(HALF_PI - AoA) + smn));
    }

    // The sums are intentionally unused while the Cp formula is disabled.
    (void)smt;
    (void)smn;

    // Debug pass-through: copy Gamma into Cp. Each qualifying thread handles a
    // strided share of the 400 elements instead of every thread writing all.
    if (ty == 0 && bx == 0)
    {
        for (int k = tx; k < p; k += blockDim.x)
        {
            Cp[k] = Gamma[k];
        }
    }
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
```