Computing a 3D volume best practices convert 1D stream index to 3D index?

I am writing a 3D volume finite element code that performs the same computatoin on lots of voxels. Is it best to just take the 3D model space and just pack it into a 1D stream? That is what I am thinking, but I am not really an expert. Just keep the streams 1D and then have functions that do the logical 3D translations. I have included some code that can do some of that. Does this technique see like the best way to do this, are there some tweeks to be made, or is there a better faster method all to gether?

/*

 * howto_streams.cu

 *

 * author      : Sam Adams

 * email       : samuel.adams@brooks.af.mil

 * date        : 20080108 - added mappings of a 1D stream to a 3D logical space

 *               20071218 - first version

 * discription : This is an attempt to figure out how to best make streams and kernels for processesing on the GPU.

 *               In this version we take the address from a 1D steam and convert it to logical 3D address

 */

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <time.h>

#include <cuda_runtime_api.h>

#define BLOCK_SIZE 256

__device__ int3 *devMem = NULL;

int3 dim;

__device__ __host__ int3 _1d_to_3d(int idx, int3 dim){

        int3 index;

       index.z = idx / (dim.x * dim.y);

        index.y = (idx - index.z * dim.x * dim.y) / dim.y;

        index.x = (idx - index.z * dim.x * dim.y - index.y * dim.y);

       return index;

}

__device__ __host__ int _3d_to_1d(int3 index, int3 dim){

        return index.x + index.y * dim.y + index.z * dim.x * dim.y;

}

__device__ __host__ int plus_1x(int idx, int3 dim){

        return idx + 1;

}

__device__ __host__ int plus_1y(int idx, int3 dim){

        return idx + dim.y;

}

__device__ __host__ int plus_1z(int idx, int3 dim){

        return idx + dim.x * dim.y;

}

__global__ void convertAddress(int3 *address, int3 dim){

        int idx = blockDim.x * blockIdx.x + threadIdx.x;

        int3 index;

       index = _1d_to_3d(idx, dim);

        address[idx].x = index.x;

        address[idx].y = index.y;

        address[idx].z = index.z;

}

int main(int argc, char **argv){

        int i;

        int3 *result = NULL;

        int3 tmpIndex_3d;

        int tmpIndex_1d;

        int n_blocks = 1 + dim.x*dim.y*dim.z/256;

       if(n_blocks*256 < dim.x*dim.y*dim.z) n_blocks++;

       dim.x = 4;

        dim.y = 4;

        dim.z = 4;

       printf("the model size is (%i,%i,%i) or %i cells\n",dim.x,dim.y,dim.z,dim.x*dim.y*dim.z);

        printf("the kernel execution will be <<<%i,%i>>>\n",n_blocks,BLOCK_SIZE);

        result = (int3*)calloc(sizeof(int3), dim.x * dim.y * dim.z);

        cudaMalloc((void**)&devMem, dim.x * dim.y * dim.z * sizeof(int3));

        convertAddress<<<n_blocks, BLOCK_SIZE>>>(devMem, dim);

        cudaMemcpy(result, devMem, dim.x * dim.y * dim.z * sizeof(int3), cudaMemcpyDeviceToHost);

       for(i = 0; i < dim.x * dim.y * dim.z; i++){

                printf("%i -> (%i,%i,%i)\n",i,result[i].x,result[i].y,result[i].z);

        }

        tmpIndex_3d.x = 2;

        tmpIndex_3d.y = 2;

        tmpIndex_3d.z = 2;

        tmpIndex_1d = _3d_to_1d(tmpIndex_3d,dim);

       printf("address (%i,%i,%i) to 1d %i\n",tmpIndex_3d.x,tmpIndex_3d.y,tmpIndex_3d.z,tmpIndex_1d);

        tmpIndex_3d.x++;

        printf("\t+1 x (%i,%i,%i) to 1d %i",tmpIndex_3d.x,tmpIndex_3d.y,tmpIndex_3d.z,_3d_to_1d(tmpIndex_3d,dim));

        tmpIndex_3d.x--;

        printf(" == %i\n",plus_1x(_3d_to_1d(tmpIndex_3d,dim),dim));

        tmpIndex_3d.y++;

        printf("\t+1 y (%i,%i,%i) to 1d %i",tmpIndex_3d.x,tmpIndex_3d.y,tmpIndex_3d.z,_3d_to_1d(tmpIndex_3d,dim));

        tmpIndex_3d.y--;

        printf(" == %i\n",plus_1y(_3d_to_1d(tmpIndex_3d,dim),dim));

        tmpIndex_3d.z++;

        printf("\t+1 y (%i,%i,%i) to 1d %i",tmpIndex_3d.x,tmpIndex_3d.y,tmpIndex_3d.z,_3d_to_1d(tmpIndex_3d,dim));

        tmpIndex_3d.z--;

        printf(" == %i\n",plus_1z(_3d_to_1d(tmpIndex_3d,dim),dim));

       return 0;

}