Shared Memory error Error: Unaligned memory accesses not supported

Hello everyone, I have a problem with the Shared Memory, when I try to load the scene on the device in the shared memory, the compiler gives me the following error: Error: Unaligned memory accesses not supported

Why? where I wrong?

this is the method where I load the scene on the device:

[codebox]host void callCuda(Pixel *a_d, int w, int h, int ch, float x, float y, float z, float3 light, float intensita, float Kd, float Ks, float N, scene *hScene)

{

if(dvertex==NULL && dface==NULL && dfaceIndex==NULL && dnormal== NULL){



		printf("Caricamento scena in memoria......");

		//Converto hScene->vertex da double in float

		hvertexTemp = (float*) malloc((hScene->vertexNum*4) * sizeof(float));

		for(int i=0; i<(hScene->vertexNum*4); i++){

			hvertexTemp[i]=hScene->vertex[i];

		}

		

		//Converto hScene->normal da double in float

		hnormalTemp = (float*) malloc((hScene->normalNum*3) * sizeof(float));

		for(int i=0; i<(hScene->normalNum*3); i++){

			hnormalTemp[i]=hScene->normal[i];

		}

		

		//Alloco memoria su device

		cutilSafeCall(cudaMalloc((void**) &dvertex, (hScene->vertexNum*4) * sizeof(float)));

		cutilSafeCall(cudaMalloc((void**) &dface, (hScene->faceNum*10) * sizeof(int)));

		cutilSafeCall(cudaMalloc((void**) &dfaceIndex, hScene->faceNum * sizeof(int)));

		cutilSafeCall(cudaMalloc((void**) &dnormal, (hScene->normalNum*3) * sizeof(float)));

		

		//Copio Dati dalla scena agli array su device

		cutilSafeCall(cudaMemcpy(dvertex, hvertexTemp, (hScene->vertexNum*4) * sizeof(float),cudaMemcpyHostToDevice));    

		cutilSafeCall(cudaMemcpy(dface, hScene->face, (hScene->faceNum*10)* sizeof(int),cudaMemcpyHostToDevice));    

		cutilSafeCall(cudaMemcpy(dfaceIndex, hScene->faceIndex, hScene->faceNum * sizeof(int),cudaMemcpyHostToDevice));

		cutilSafeCall(cudaMemcpy(dnormal, hnormalTemp, (hScene->normalNum*3) * sizeof(float),cudaMemcpyHostToDevice)); 

		

		//Copio Dati dalla scena alle variabili su device

		dfaceNum = hScene->faceNum;

		dnormalNum = hScene->normalNum;

		printf("FINE\n");

	}

float3 Eye;

Eye.x =x;

Eye.y =y;

Eye.z =z;



float distanza=2.00;

float vFov=60.00;

float hFov=vFov*((float)w/(float)h);



vFov = vFov * M_PI / 180;

hFov = hFov * M_PI / 180;



float3 At;

At.x= Eye.x;

At.y= Eye.y;

At.z=Eye.z-distanza;

float3 vettoreCam = At-Eye;

float3 vettore_up = {0.0,1.0,0.0};

float3 vettore_v = cross(vettoreCam,vettore_up);



float3 ydir= -vettore_up;

float3 xdir= cross(ydir,vettoreCam);



xdir= normalize(xdir)*((2.0*distanza*tan(hFov/2.0))/(float)w);

ydir= normalize(ydir)*((2.0*distanza*tan(vFov/2.0))/(float)h);



float3 LLdir;

LLdir.x=vettoreCam.x-(xdir.x*w)/2;

LLdir.y=vettoreCam.y-(ydir.y*h)/2;

LLdir.z=vettoreCam.z;



dim3 dimGrid (80,60);

dim3 dimBlock (8,8);

DrawScreen <<< dimGrid, dimBlock, sizeof(float3)*6 >>> (a_d,w,h,ch,Eye,xdir,ydir,LLdir,light, intensita, Kd, Ks, N, dvertex, dface, dfaceIndex, dnormal, dfaceNum);

}[/codebox]

This is the kernel where I try to load the scene on Shared Memory:

[codebox]#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <cuda.h>

#include <cutil_math.h>

#include <cutil_inline.h>

#include <cuda_runtime.h>

#include “DrawScreen.h”

#include “geometry.h”

#define EPSILON 0.00001

#define M_PI 3.1415926535897932384626433832795

static float *hvertexTemp = NULL;

static float *hnormalTemp = NULL;

align(16) static float *dnormal = NULL;

align(16) static float *dvertex = NULL;

align(16) static int *dface = NULL;

align(16) static int *dfaceIndex = NULL;

align(16) static int dfaceNum = NULL;

align(16) static int dnormalNum = NULL;

extern shared char sharedArray;

global void DrawScreen(float *a_d, int w, int h, int ch,float3 eye, float3 xdir, float3 ydir, float3 LLdir, float3 light, float intensita, float Kd, float Ks, float N,

					   float *dvertex, int *dface, int *dfaceIndex, float *dnormal, int dfaceNum)

{

float3* vertex = (float3*)sharedArray;

float3* normal = (float3*)&vertex[3];

int limit =whch;

int tdx = (threadIdx.x+blockIdx.x*blockDim.x);

int tdy = (threadIdx.y+blockIdx.y*blockDim.y);

int idarray = (tdx+tdy*w)*ch;

float du, dv, dw, dz;

float tempDz=-1000;

float tempDu, tempDv;

float3 tempVista, tempLuce, tempNormal, tempPunto;



float3 dir= CalcolaVettorePunto(xdir, ydir, LLdir, tdx, tdy);

int inc=0, tempInc=0;

Pixel inte;

for (int fi=0; fi<dfaceNum; fi++){

	if(tdx==0 && tdy==0){

	vertex[0].x = dvertex[((dface[dfaceIndex[fi]+1]-1)*4)];  //--> <b>Error: Unaligned memory accesses not supported</b>

	vertex[0].y = dvertex[((dface[dfaceIndex[fi]+1]-1)*4)+1];

	vertex[0].z = dvertex[((dface[dfaceIndex[fi]+1]-1)*4)+2];

	vertex[1].x = dvertex[((dface[dfaceIndex[fi]+4]-1)*4)];

	vertex[1].y = dvertex[((dface[dfaceIndex[fi]+4]-1)*4)+1];

	vertex[1].z = dvertex[((dface[dfaceIndex[fi]+4]-1)*4)+2];

	vertex[2].x = dvertex[((dface[dfaceIndex[fi]+7]-1)*4)];

	vertex[2].y = dvertex[((dface[dfaceIndex[fi]+7]-1)*4)+1];

	vertex[2].z = dvertex[((dface[dfaceIndex[fi]+7]-1)*4)+2];

	normal[0].x = dnormal[((dface[dfaceIndex[fi]+3]-1)*3)];

	normal[0].y = dnormal[((dface[dfaceIndex[fi]+3]-1)*3)+1];

	normal[0].z = dnormal[((dface[dfaceIndex[fi]+3]-1)*3)+2];

	normal[1].x = dnormal[((dface[dfaceIndex[fi]+6]-1)*3)];

	normal[1].y = dnormal[((dface[dfaceIndex[fi]+6]-1)*3)+1];

	normal[1].z = dnormal[((dface[dfaceIndex[fi]+6]-1)*3)+2];

	normal[2].x = dnormal[((dface[dfaceIndex[fi]+9]-1)*3)];

	normal[2].y = dnormal[((dface[dfaceIndex[fi]+9]-1)*3)+1];

	normal[2].z = dnormal[((dface[dfaceIndex[fi]+9]-1)*3)+2];

	}

	

	__syncthreads();

	...

}

}[/codebox]