The mistake must be somewhere in code you are not showing. When I hack together this:

```
#include <iostream>
#include <fstream>
using namespace std;
#ifndef gpuAssert
#include <stdio.h>
#define gpuAssert( condition ) { if( (condition) != 0 ) { fprintf( stderr, "\n FAILURE %s in %s, line %d\n", cudaGetErrorString(condition), __FILE__, __LINE__ ); exit( 1 ); } }
#endif
#ifndef gpuCheck
#define gpuCheck { gpuAssert( cudaGetLastError() ) }
#endif
__device__ float dot( const float4 v, const float4 w )
{
return (v.x*w.x) + (v.y*w.y) + (v.z*w.z) + (v.w*w.w);
}
__global__ void grNormalize(const unsigned size, float4 *gr)
{
unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx>=size) return; //over size
float4 vec=gr[idx];
vec.w = sqrtf(dot(vec, vec));
vec.x/=vec.w;
vec.y/=vec.w;
vec.z/=vec.w;
gr[idx]=vec;
}
void entryPoint(unsigned xsize, unsigned ysize, unsigned zsize, float *gv)
{
unsigned size=xsize*ysize*zsize;
unsigned byteSize=size*sizeof(float);
unsigned iGr=0, iGv=0;
float *gr=(float *)malloc(byteSize*(3+1));//4 floats, 3 for vector field and 1 for scalar field
for (unsigned z=0; z<zsize; z++)
for (unsigned y=0; y<ysize; y++)
for (unsigned x=0; x<xsize; x++)
{
gr[iGr++]=gv[iGv++]; //x
gr[iGr++]=gv[iGv++]; //y
gr[iGr++]=gv[iGv++]; //z
gr[iGr++]=0; //w
}
float4 *gpuGr;
gpuAssert( cudaMalloc(&gpuGr, size*sizeof(float4)) );
gpuAssert( cudaMemcpy(gpuGr, gr, size*sizeof(float4), cudaMemcpyHostToDevice) );
grNormalize<<< ceil(size/256.0), 256 >>>(size, gpuGr); gpuCheck;
gpuAssert( cudaMemcpy(gr, gpuGr, size*sizeof(float4), cudaMemcpyDeviceToHost ) );
ofstream f("gr.txt");
for (unsigned i=0; i<32; i+=4)
f<<"("<<gr[i]<<","
<<gr[i+1]<<","
<<gr[i+2]<<") "
<<gr[i+3]<<"\n";
f.close(); //human output this time
}
int main(void)
{
const unsigned int xs = 128, ys = 128, zs = 128;
const unsigned int sz = 3 * xs * ys * zs;
float * v = (float *)malloc( size_t(sz) * sizeof(float) );
for(unsigned int i=0; i< sz; i++) {
v[i] = (float)i / 1.e6f;
}
entryPoint(xs, ys, zs, v);
gpuAssert(cudaThreadExit());
}
```

It compiles and runs like this:

```
avidday@cuda:~$ nvcc -arch=sm_20 kuusipÃ¤Ã¤.cu -o kuusipÃ¤Ã¤
avidday@cuda:~$ ./kuusipÃ¤Ã¤
avidday@cuda:~$ cat gr.txt
(0,0.447214,0.894427) 2.23607e-06
(0.424264,0.565685,0.707107) 7.07107e-06
(0.491539,0.573462,0.655386) 1.22066e-05
(0.517892,0.575435,0.632979) 1.73781e-05
(0.531891,0.576215,0.620539) 2.2561e-05
(0.540563,0.5766,0.612638) 2.77489e-05
(0.546459,0.576818,0.607177) 3.29393e-05
(0.550728,0.576953,0.603178) 3.81314e-05
```

and those results look about right to my untrained eye.