cuda passing user defined structure to a kernel failed

My problem is posted in this link http://stackoverflow.com/questions/28090950/cuda-passing-user-defined-structure-to-a-kernel-failed.
My architecture is: “GeForce 210”, CUDA 5.0, with a cuda capabiliy Major/Minor version 1.2

Here is my compiling code below, in a “kernel.cu” file:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuPrintf.cu"

#include <stdio.h>

#include <stdlib.h>
#include <stdio.h>

typedef struct
{
    float a, b;
} point;

struct   __Q_VECTOR__{
    double* Data;       
    int     Dimension;  
    int     Cluster;    
};

typedef struct __Q_VECTOR__     VQ_VECTOR;

__global__ void testKernel(VQ_VECTOR *X, int N){
    int i= blockIdx.x*blockDim.x + threadIdx.x;
    cuPrintf("\n testKernel entrance by the global threadIdx= %d\n", i);
    for(int k=0; k<X[i].Dimension; k++)
        cuPrintf("%.3f, ",X[i].Data[k]);
    cuPrintf("\n");
}

int main(void){
    int L = 3, //.Data length
        N = 100;

    VQ_VECTOR   *A,
                *device_VQ_VECTOR;

    cudaError_t cudaStatus;

    A =   (VQ_VECTOR*)malloc(N*sizeof(VQ_VECTOR));
    for(int i=0; i<N; i++){
        VQ_VECTOR a;
        a.Data = (double*)malloc(L*sizeof(double));;
        a.Cluster   =   1;
        a.Dimension =   L;
        for(int j=0; j<L; j++)
            a.Data[j]=(1+i)*(1+j);

        A[i] = a;
    }

    //Prinf of all the elements of A
    for(int i=0; i<2; i++){
        printf("\nA[%d]={", i);
        for(int j=0; j<L; j++)
            printf("%.3f",A[i].Data[j]);
        printf("}\n");
    }

    printf("\n\n");
    //I Allocate and Copy data from A to device_VQ_VECTORon the GPU memory

    cudaDeviceReset();
    cudaStatus = cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR));
    cudaStatus = cudaMemcpy(device_VQ_VECTOR, A, N*sizeof(VQ_VECTOR), cudaMemcpyHostToDevice);

    for(int i = 0; i != N; ++i) {
        /* can't access device_VQ_VECTOR[i].Data directly from host-side,
         * working around it with proxy variable */
        double *out;
        cudaMalloc(&out, L*sizeof(double));
        cudaMemcpy(out, A[i].Data, L*sizeof(double),
                cudaMemcpyHostToDevice);
        cudaMemcpy(&device_VQ_VECTOR[i].Data, &out, sizeof(void*),
                cudaMemcpyHostToDevice);

        // will re-allocate later, for simplicity sake
        free(A[i].Data);
    }

    cudaPrintfInit();
    testKernel<<<N,1>>>(device_VQ_VECTOR, N);//to test and see on a sigle thread
    cudaPrintfDisplay(stdout, true);
    cudaPrintfEnd();
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "\n testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }
    cudaStatus = cudaMemcpy(A, device_VQ_VECTOR, N*sizeof(VQ_VECTOR), cudaMemcpyDeviceToHost);
    for(int i = 0; i != N; ++i) {
        // allocate array, copy data
        double *array = (double*)malloc(L*sizeof(double));
        cudaMemcpy(array, A[i].Data, L*sizeof(double),
                cudaMemcpyDeviceToHost);

        // assign new array to A[i]
        A[i].Data = array;
    }
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "\n testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }
	  for(int i=0; i<2; i++){
        printf("\nA[%d]={", i);
        for(int j=0; j<L; j++)
            printf("%.3f, ",A[i].Data[j]);
        printf("}\n");
    }
    cudaFree(device_VQ_VECTOR);

    // don't forget to free A and all its Data

    return 0;
}

The output are not what I expected, beacause I have results like (it is huge, I don’t want to post too much too):

[1, 0]: 0.000, [1, 0]: 0.000, [1, 0]: 0.000, [1, 0]:
[2, 0]:
 testKernel entrance by the global threadIdx= 2
[2, 0]: 0.000, [2, 0]: 0.000, [2, 0]: 0.000, [2, 0]:
[3, 0]:
 testKernel entrance by the global threadIdx= 3
[3, 0]: 0.000, [3, 0]: 0.000, [3, 0]: 0.000, [3, 0]:
[4, 0]:
 testKernel entrance by the global threadIdx= 4
[4, 0]: 0.000, [4, 0]: 0.000, [4, 0]: 0.000, [4, 0]:
[5, 0]:
 testKernel entrance by the global threadIdx= 5
[5, 0]: 0.000, [5, 0]: 0.000, [5, 0]: 0.000, [5, 0]:
[6, 0]:

I don’t know what’s going wrong, because others don’t have these bad results with only zeroes. Is it a problem with my device architecture? What can I do to solve this problem?

something like this may work for the host, not necessarily the device:

sizeof(VQ_VECTOR)

cudaStatus = cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR));

if the compiler super-pads your structure (sizeof() returns wrong size), you are in trouble, and if the compiler does not super-pad your structure, you are in trouble (misaligned, according to the device)

probably need __align(8) or so for it to work

and you need to ask yourself whether using such a structure on the device, like you are doing, is truly efficient
for one, you would likely have stridden memory accesses, by the looks of it

for(int k=0; k<X[i].Dimension; k++)
cuPrintf("%.3f, ",X[i].Data[k]);

Jimmy, I mention that my code works fine for someone else, but not on my architecture. If i want to use the __align(8) or so for it to work, can you suggest exactly a code where you specify it ?
This is just a simple example, the efficiency i think can be true if I have a fixed dimension of Data, and a wide range of VQ_VECTOR array size

Output are correct for others. my gpu architecture (because I’m on VS 2008 with CUDA5.0), is sm_10. Is it related toot to my problem?

i think it is more related to your platform (os, etc) than your architecture

the ‘someone else’ likely has greater architecture AND consequently greater os/ VS version

for devices, alignment is rather a big thing, and i have bumped my head sufficiently now with little to show for it afterwards, such that i decided to give up passing structures, in favour of flatter arrays or universal (and natural) typed structures

search the programming guide for ‘alignment’; you will note 2 key sections discussing alignment - around page 80 and 200
with that you will find align(8), __alignof(), and also a sample code to align a mixed type structure (around page 200)

also search the programming guide for ‘windows’ and you will note the alignment may also differ between cuda and the ms compiler