help on kmeans passing of 2D array to gpu

hii iam trying to write a kernel for kmeans can any one help me out,

i am confused regarding passing the array of structures to gpu

typedef struct {
double p[2];
double distance[500];
}Point;

Point all_data[1000];

this is how i am passing the array of structures
for( unsigned int i = 0; i < Psize; i++)
for( unsigned int j = 0; j < NA; j++)
{
h_idata[i][j]=all_data[i].p[j];//host data points
}

for( unsigned int i = 0; i < K; i++)
       for( unsigned int j = 0; j < NA; j++)
      {
                  h_idata2[i][j]=pop.clu_cent[i].p[j];//cluster centres
          }  

double* d_idata;
double* d_idata2;

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, m1));//allocating memory

CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, m1,cudaMemcpyHostToDevice) );//points copying

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_data2, m2));//allocating memory

CUDA_SAFE_CALL( cudaMemcpy( d_idata2, h_idata2, m2,cudaMemcpyHostToDevice) );//clusters copying

float *d_odata;//output data declaration

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, m3));

can any one help me out. on this issue

#ifdef _WIN32

define NOMINMAX

#endif
#define NA 2 /* 数据维数 /
#define K 500 /
聚类数 /
#define Psize 300000 /
种群大小 /
#define T 1000000 /
最大迭代数 /
#define ED 0.00000001 /
结束条件 */
// declaration, forward
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>

// includes, project
#include <cutil.h>

// includes, kernels
#include <kmeans_kernel.cu>

typedef struct {
double p[NA];
double distance[K];
}Point;

typedef struct {
Point clu_cent[K]; /* 即cluster_center 簇类中心 /
int cluster[K][Psize]; /
簇类数组 /
int cluster_num[K]; /
簇类中一组数据的编号 /
double fitness; /
æ ·æœ¬é€‚åº”åº¦å€¼,用于判断结束条件 /
double old_fitness; /
前一次迭代的适应度值 /
double Je; dim3 dimgrid((Psize - (Psize % 512)) / 512 + 1, K, 1);
dim3 dimblock( 512, 1, 1); /
æ‰€æœ‰æ ·æœ¬çš„å¹³æ–¹è¯¯å·®å’Œ /
}Pop;
// functions
int Is_equal(int a[], int n, int b);
double Euclid(int x, int y);
void input_data();
void Init_center();
void calculate_distance();
void Make_new_cluster();
void Make_new_center();
void output_info(int flag);
void runTest( int argc, char
* argv);

// array of structures
Point all_data[Psize];
Pop pop;

//input function
void input_data()
{
FILE *infile;
int i, j;
double data;

if((infile = fopen("test1.data", "r")) == NULL){
	printf("file not found\n");
	exit(1);
}
for(i = 0; i < Psize; i++)
	for(j = 0; j < NA; j++){
		fscanf(infile, "%lf", &data);
		all_data[i].p[j] = data;
	}
fclose(infile);		

}
// intitialsatio of clusters
void Init_center()
{
int i, j;
int num = 0;
int rand_num;
int rand_num_tmp[K];
/* 随机产生三个0~Psize的数 /
while(num < K){
rand_num = rand() % Psize;
if(!Is_equal(rand_num_tmp, num, rand_num))
rand_num_tmp[num++] = rand_num;
}
for(i = 0; i < K; i++){
// printf(“初始化质心%d:\n”, i + 1);
for(j = 0; j < NA; j++){
pop.clu_cent[i].p[j] = all_data[rand_num_tmp[i]].p[j];
// printf("%lf “,pop.clu_cent[i].p[j]);
}
printf(”\n");
}
}
// some function
int Is_equal(int a[], int n, int b)
{
int i;
for(i = 0; i < n; i++)
if(a[i] == b) return 1;
return 0;
}
/
*******************************************

  • calculated Psize set of data to a center of mass of K * Euclidean distance
    *******************************************/
    void calculate_distance()
    {
    int i, j;
    for(i = 0; i < Psize; i++)
    for(j = 0; j < K; j++){
    all_data[i].distance[j] = Euclid(i, j);
    // printf("%d—%d— %lf \n", i, j, all_data[i].distance[j]);
    }
    }
    ///////eucludian distance calculation
    double Euclid(int x, int y)
    {
    int i;
    for(i = 0; i < NA; i++)
    {
    distance += pow((all_data.p[i] - pop.clu_cent[y].p[i]), 2);
    }
    distance = sqrt(distance);
    return distance;
    }
    void Make_new_cluster()
    {
    int i, j;

    double min;

    for(i = 0; i < K; i++) /* 初始化编号 /
    pop.cluster_num[i] = 0;
    for(i = 0; i < Psize; i++){
    int index = 0;
    min = all_data[i].distance[0];
    for(j = 1; j < K; j++){ /
    ç­›é€‰åˆ°ç°‡å¿ƒæ¬§å‡ é‡Œå¾·æœ€å°çš„ /
    if(all_data[i].distance[j] < min){
    min = all_data[i].distance[j];
    index = j;
    }
    }
    /
    划分簇集 /
    pop.cluster[index][pop.cluster_num[index]++] = i;
    }
    /
    è®¡ç®—æ‰€æœ‰æ ·æœ¬çš„å¹³æ–¹è¯¯å·®å’Œ /
    pop.Je = 0;
    for(i = 0; i < K; i++)
    for(j = 0; j < pop.cluster_num[i]; j++){
    /
    æ ·æœ¬åˆ°ç°‡å¿ƒçš„å€¼å³ä¸ºå…¶æ¬§å‡ é‡Œå¾·è·ç¦» /
    pop.Je +=pow(all_data[pop.cluster[i][j]].distance[i],2);
    }
    pop.old_fitness = pop.fitness; /
    前一次迭代适应度值 /
    // printf(“old_fitness = %lf\n”, pop.old_fitness);
    pop.fitness = pop.Je; /
    æ‰€æœ‰æ ·æœ¬å¹³æ–¹è¯¯å·®å’Œå³ä¸ºé€‚åº”åº¦å€¼ */
    }
    void Make_new_center()
    {
    int i, j, n;
    double tmp_sum;

    for(i = 0; i < K; i++)
    for(j = 0; j < NA; j++){
    tmp_sum = 0;
    for(n = 0; n < pop.cluster_num[i]; n++){
    /* 第i个簇的第j维数的所有数据和 /
    tmp_sum += all_data[pop.cluster[i][n]].p[j];
    }
    /
    取平均数得到新的簇中心 /nput_data(); / 导入数据 /
    float ctimer1= clock();
    Init_center(); /
    初始化质心 */
    float ctimer2= clock();
    float ctimer7;
    pop.clu_cent[i].p[j] = tmp_sum / pop.cluster_num[i];
    }
    }

void output_info(int flag)
{
int i, j, n;

for(i = 0; i < K; i++){	
	if(flag == 0){
		printf("初始化质心%d:\n", i + 1);
		for(n = 0; n < NA; n++)
			printf("%lf	",pop.clu_cent[i].p[n]);
	} else if(flag == 1){ 
		printf("最终质心%d:\n", i + 1);
		for(n = 0; n < NA; n++)
			printf("%lf	",pop.clu_cent[i].p[n]);
	}
	printf("\n簇类%d:\n", i + 1);
	for(j = 0; j < pop.cluster_num[i]; j++){			
		for(n = 0; n < NA; n++){
				printf("%lf	", 
				all_data[pop.cluster[i][j]].p[n]);
		}
		printf("\n");
	}
}

}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////

/********************************

  •  主函数		*
    

******************************/
main()
{
float start = clock();
float ctimer3;
float ctimer4;
float ctimer5;
int i;
double differ = 1; /
适应度差值 /
int flag = 0; /
ç”¨æ¥æ ‡ç¤ºæ˜¯æ˜¾ç¤ºåˆå§‹æ•°æ®è¿˜æ˜¯èšç±»åŽçš„æ•°æ® /
input_data(); /
导入数据 /
float ctimer1= clock();
Init_center(); /
初始化质心 /
float ctimer2= clock();
float ctimer7;
int chunk = CHUNKSIZE;
//int tid = omp_get_thread_num();
//int num_threads = 16;
/
for(int a=0;a<Psize;a++)
for(int b=0; b<NA ;b++)
double Pcopy[a][bForum]=all_data[a].p[b];
for(int c=0;c<Psize;c++)
for(int d=0; d<NA ;d++)
double dcopy[c][d]=all_data[a].distance[b]; /
//#pragma omp for schedule (static, chunk)
for(i = 0; (i < T) && (differ > ED); i++)
{
float ctimer7= clock();
calculate_distance(); /
è®¡ç®—æ¬§å‡ é‡Œå¾·è·ç¦» */

	ctimer3= clock();
	Make_new_cluster();		/* 产生新的聚类 */
	
	ctimer4 = clock();
	if(flag == 0)
            {
		output_info(flag);	/* 输出第一次聚类后的结果 */
		flag = 1;	
	}
	Make_new_center();		/* 对新的聚类产生新的质心 */
	
	ctimer5= clock();		
	differ = pop.old_fitness - pop.fitness;	/* 判断条件 */
	differ = fabs(differ);
      }

printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++\n");
float ctimer6; 
    ctimer6 = clock();
output_info(flag);	/* 聚类后显示结果 */
    float end= clock();
    printf("%f time taken in  seconds for input_data\n",(ctimer1-start)/CLOCKS_PER_SEC);
    printf("%f time taken in  seconds for init centre\n",(ctimer2-ctimer1)/CLOCKS_PER_SEC);
    printf("%f time taken in  seconds for calculate distance\n",(ctimer3-ctimer7)/CLOCKS_PER_SEC);
    printf("%f time taken in  seconds for make new cluster\n",(ctimer4-ctimer3)/CLOCKS_PER_SEC);
    printf("%f time taken in  seconds for make new center\n",(ctimer5-ctimer4)/CLOCKS_PER_SEC);
printf("%f time taken in  seconds for output_info\n",(-ctimer6+end)/CLOCKS_PER_SEC);
    printf("%f time taken in  seconds for the for loop\n",(ctimer6-ctimer2)/CLOCKS_PER_SEC);
printf("%f time taken in  seconds for total program\n",(end-start)/CLOCKS_PER_SEC);
return 0;

}

void runTest( int argc, char** argv)
{

CUT_DEVICE_INIT();

unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));

int m1 = sizeof( float)(Psize * NA);//simplified memory access
int m2 = sizeof( float)
(K * NA);//simplified memory access
int m3 = sizeof( float)*(Psize * K);//simplified memory access

float* h_idata = (float*) malloc( m1);
float* h_idata2 = (float*) malloc( m2);

for( unsigned int i = 0; i < Psize; i++)
       for( unsigned int j = 0; j < NA; j++)
     {
                  h_idata[i][j]=all_data[i].p[j];//host data points
         }
      
for( unsigned int i = 0; i < K; i++)
       for( unsigned int j = 0; j < NA; j++)
      {
                  h_idata2[i][j]=pop.clu_cent[i].p[j];//cluster centres
          }  

double* d_idata;
double* d_idata2;

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, m1));//allocating memory

CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, m1,cudaMemcpyHostToDevice) );//points copying

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_data2, m2));//allocating memory

CUDA_SAFE_CALL( cudaMemcpy( d_idata2, h_idata2, m2,cudaMemcpyHostToDevice) );//clusters copying

float *d_odata;//output data declaration

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, m3));
/*
 * how to declare number of threads and blocks lets dicuss
*/
kmeans_kernel<<< num_threads,num_blocks>>>( d_idata, d_idata2, d_odata,Psize,K,NA);//kernel declaration

CUT_CHECK_ERROR("Kernel execution failed");

float* h_odata = (float*) malloc( m3);

CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, m3, cudaMemcpyDeviceToHost) );//copying back memory

CUT_SAFE_CALL( cutStopTimer( timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));

free( h_idata);
free( h_odata);
CUDA_SAFE_CALL(cudaFree(d_idata));
CUDA_SAFE_CALL(cudaFree(d_odata));

}