Hello everyone, I have a problem with the Shared Memory, when I try to load the scene on the device in the shared memory, the compiler gives me the following error: Error: Unaligned memory accesses not supported
Why? where I wrong?
this is the method where I load the scene on the device:
[codebox]host void callCuda(Pixel *a_d, int w, int h, int ch, float x, float y, float z, float3 light, float intensita, float Kd, float Ks, float N, scene *hScene)
{
if(dvertex==NULL && dface==NULL && dfaceIndex==NULL && dnormal== NULL){
printf("Caricamento scena in memoria......");
//Converto hScene->vertex da double in float
hvertexTemp = (float*) malloc((hScene->vertexNum*4) * sizeof(float));
for(int i=0; i<(hScene->vertexNum*4); i++){
hvertexTemp[i]=hScene->vertex[i];
}
//Converto hScene->normal da double in float
hnormalTemp = (float*) malloc((hScene->normalNum*3) * sizeof(float));
for(int i=0; i<(hScene->normalNum*3); i++){
hnormalTemp[i]=hScene->normal[i];
}
//Alloco memoria su device
cutilSafeCall(cudaMalloc((void**) &dvertex, (hScene->vertexNum*4) * sizeof(float)));
cutilSafeCall(cudaMalloc((void**) &dface, (hScene->faceNum*10) * sizeof(int)));
cutilSafeCall(cudaMalloc((void**) &dfaceIndex, hScene->faceNum * sizeof(int)));
cutilSafeCall(cudaMalloc((void**) &dnormal, (hScene->normalNum*3) * sizeof(float)));
//Copio Dati dalla scena agli array su device
cutilSafeCall(cudaMemcpy(dvertex, hvertexTemp, (hScene->vertexNum*4) * sizeof(float),cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(dface, hScene->face, (hScene->faceNum*10)* sizeof(int),cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(dfaceIndex, hScene->faceIndex, hScene->faceNum * sizeof(int),cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(dnormal, hnormalTemp, (hScene->normalNum*3) * sizeof(float),cudaMemcpyHostToDevice));
//Copio Dati dalla scena alle variabili su device
dfaceNum = hScene->faceNum;
dnormalNum = hScene->normalNum;
printf("FINE\n");
}
float3 Eye;
Eye.x =x;
Eye.y =y;
Eye.z =z;
float distanza=2.00;
float vFov=60.00;
float hFov=vFov*((float)w/(float)h);
vFov = vFov * M_PI / 180;
hFov = hFov * M_PI / 180;
float3 At;
At.x= Eye.x;
At.y= Eye.y;
At.z=Eye.z-distanza;
float3 vettoreCam = At-Eye;
float3 vettore_up = {0.0,1.0,0.0};
float3 vettore_v = cross(vettoreCam,vettore_up);
float3 ydir= -vettore_up;
float3 xdir= cross(ydir,vettoreCam);
xdir= normalize(xdir)*((2.0*distanza*tan(hFov/2.0))/(float)w);
ydir= normalize(ydir)*((2.0*distanza*tan(vFov/2.0))/(float)h);
float3 LLdir;
LLdir.x=vettoreCam.x-(xdir.x*w)/2;
LLdir.y=vettoreCam.y-(ydir.y*h)/2;
LLdir.z=vettoreCam.z;
dim3 dimGrid (80,60);
dim3 dimBlock (8,8);
DrawScreen <<< dimGrid, dimBlock, sizeof(float3)*6 >>> (a_d,w,h,ch,Eye,xdir,ydir,LLdir,light, intensita, Kd, Ks, N, dvertex, dface, dfaceIndex, dnormal, dfaceNum);
}[/codebox]
This is the kernel where I try to load the scene on Shared Memory:
[codebox]#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cutil_math.h>
#include <cutil_inline.h>
#include <cuda_runtime.h>
#include “DrawScreen.h”
#include “geometry.h”
#define EPSILON 0.00001
#define M_PI 3.1415926535897932384626433832795
static float *hvertexTemp = NULL;
static float *hnormalTemp = NULL;
align(16) static float *dnormal = NULL;
align(16) static float *dvertex = NULL;
align(16) static int *dface = NULL;
align(16) static int *dfaceIndex = NULL;
align(16) static int dfaceNum = NULL;
align(16) static int dnormalNum = NULL;
extern shared char sharedArray;
global void DrawScreen(float *a_d, int w, int h, int ch,float3 eye, float3 xdir, float3 ydir, float3 LLdir, float3 light, float intensita, float Kd, float Ks, float N,
float *dvertex, int *dface, int *dfaceIndex, float *dnormal, int dfaceNum)
{
float3* vertex = (float3*)sharedArray;
float3* normal = (float3*)&vertex[3];
int limit =whch;
int tdx = (threadIdx.x+blockIdx.x*blockDim.x);
int tdy = (threadIdx.y+blockIdx.y*blockDim.y);
int idarray = (tdx+tdy*w)*ch;
float du, dv, dw, dz;
float tempDz=-1000;
float tempDu, tempDv;
float3 tempVista, tempLuce, tempNormal, tempPunto;
float3 dir= CalcolaVettorePunto(xdir, ydir, LLdir, tdx, tdy);
int inc=0, tempInc=0;
Pixel inte;
for (int fi=0; fi<dfaceNum; fi++){
if(tdx==0 && tdy==0){
vertex[0].x = dvertex[((dface[dfaceIndex[fi]+1]-1)*4)]; //--> <b>Error: Unaligned memory accesses not supported</b>
vertex[0].y = dvertex[((dface[dfaceIndex[fi]+1]-1)*4)+1];
vertex[0].z = dvertex[((dface[dfaceIndex[fi]+1]-1)*4)+2];
vertex[1].x = dvertex[((dface[dfaceIndex[fi]+4]-1)*4)];
vertex[1].y = dvertex[((dface[dfaceIndex[fi]+4]-1)*4)+1];
vertex[1].z = dvertex[((dface[dfaceIndex[fi]+4]-1)*4)+2];
vertex[2].x = dvertex[((dface[dfaceIndex[fi]+7]-1)*4)];
vertex[2].y = dvertex[((dface[dfaceIndex[fi]+7]-1)*4)+1];
vertex[2].z = dvertex[((dface[dfaceIndex[fi]+7]-1)*4)+2];
normal[0].x = dnormal[((dface[dfaceIndex[fi]+3]-1)*3)];
normal[0].y = dnormal[((dface[dfaceIndex[fi]+3]-1)*3)+1];
normal[0].z = dnormal[((dface[dfaceIndex[fi]+3]-1)*3)+2];
normal[1].x = dnormal[((dface[dfaceIndex[fi]+6]-1)*3)];
normal[1].y = dnormal[((dface[dfaceIndex[fi]+6]-1)*3)+1];
normal[1].z = dnormal[((dface[dfaceIndex[fi]+6]-1)*3)+2];
normal[2].x = dnormal[((dface[dfaceIndex[fi]+9]-1)*3)];
normal[2].y = dnormal[((dface[dfaceIndex[fi]+9]-1)*3)+1];
normal[2].z = dnormal[((dface[dfaceIndex[fi]+9]-1)*3)+2];
}
__syncthreads();
...
}
}[/codebox]