[ask] array operation and memory allocation

Hi everyone,

I've just started learning CUDA, and I ran into something that confuses me.

Maybe my questions are really simple. (noob detected :D)

I have this code, with a data structure like this:

#include <stdio.h>

// data structure
typedef struct{
	int column[22];
	int total;
}row;

typedef row table[100];

// count the sum of each row
void count(table *p){
	int i,j;
	int temp;
	for(i=0;i<100;i++){
		temp=0;
		for(j=0;j<22;j++){
			temp=temp+(*p)[i].column[j];
		}
		(*p)[i].total=temp;
	}
}

// main program
int main(){
	int i,j;
	table p;

	// initialize table
	for(i=0;i<100;i++){
		for(j=0;j<22;j++){
			p[i].column[j]=j;
		}
		p[i].total=0;
	}

	count(&p);
}

I'd like to calculate the sum of each row and store it in the total field.

My questions are simple (the VecAdd pattern I've been trying to adapt is sketched after the list):

  1. How do I allocate a data structure like that in device memory?

  2. How do I copy the data from host memory to device memory?

  3. How do I invoke a kernel to run the count procedure on the GPU?
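
For context, the example I've been trying to adapt is the VecAdd sample from the CUDA C Programming Guide. This is just my sketch of it from memory (so the details may be off), using plain float arrays instead of my row/table types:

#include <stdio.h>
#include <stdlib.h>

// Adds two float vectors element-wise: one thread handles one element
__global__ void VecAdd(const float* A, const float* B, float* C, int N){
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i < N)
		C[i] = A[i] + B[i];
}

int main(){
	int N = 1000;
	size_t size = N * sizeof(float);

	// Allocate and initialize the input vectors in host memory
	float* h_A = (float*)malloc(size);
	float* h_B = (float*)malloc(size);
	float* h_C = (float*)malloc(size);
	for(int i = 0; i < N; i++){ h_A[i] = i; h_B[i] = 2 * i; }

	// Allocate the vectors in device memory
	float *d_A, *d_B, *d_C;
	cudaMalloc((void**)&d_A, size);
	cudaMalloc((void**)&d_B, size);
	cudaMalloc((void**)&d_C, size);

	// Copy the inputs from host memory to device memory
	cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

	// Launch enough blocks of 256 threads to cover all N elements
	int threadsPerBlock = 256;
	int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

	// Copy the result back from device memory to host memory
	cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
	printf("C[1] = %f\n", h_C[1]);

	// Free device and host memory
	cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
	free(h_A); free(h_B); free(h_C);
}

What I can't figure out is how to map that pattern onto my row struct and table typedef.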

Thanks :)

I'm sorry… I've worked out a solution, but I'm not really sure about it.

Please let me know if there’s something wrong with this, thanks again :)

#include <stdio.h>
#include <stdlib.h>

typedef struct{
	int column[22];
	int total;
}row;

// Device code: each thread sums the 22 columns of one row
__global__ void VecAdd(row* A, int N){
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j,temp;
	if (i < N){
		temp=0;
		for(j=0;j<22;j++){
			temp=temp+A[i].column[j];
		}
		A[i].total=temp;
	}
}

int main(){
	int i,j;
	int N = 100;
	size_t size = N * sizeof(row);

	// Allocate the table h_P in host memory
	row* h_P = (row*)malloc(size);

	// Initialize table data
	for(i=0;i<N;i++){
		for(j=0;j<22;j++){
			h_P[i].column[j]=j;
		}
		h_P[i].total=0;
	}

	// Allocate the table d_P in device memory
	row* d_P;
	cudaMalloc((void**)&d_P, size);

	// Copy the table from host memory to device memory
	cudaMemcpy(d_P, h_P, size, cudaMemcpyHostToDevice);

	// Invoke kernel (I'm still confused in this part)
	int threadsPerBlock = 256;                                      // why 256?
	int blocksPerGrid = (N+threadsPerBlock-1)/threadsPerBlock;      // why (N+threadsPerBlock-1)/threadsPerBlock ?
	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_P, N);

	// Copy result from device memory to host memory
	cudaMemcpy(h_P, d_P, size, cudaMemcpyDeviceToHost);

	// Output
	for(i=0;i<N;i++){
		for(j=0;j<22;j++){
			printf("%d ",h_P[i].column[j]);
		}
		printf("Total=%d\n",h_P[i].total);
	}

	// Free device memory
	cudaFree(d_P);

	// Free host memory
	free(h_P);
}
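
About the two question comments in the launch code, my current guess: threadsPerBlock = 256 seems to be just a common choice taken from the samples (a multiple of the warp size of 32), and (N+threadsPerBlock-1)/threadsPerBlock looks like integer division rounded up, so there are always enough threads to cover all N rows; with N = 100 that gives (100+255)/256 = 1 block. Is that right?

Also, I haven't added any error checking yet. From the reference manual, I think something like this could go right after the kernel launch (the variable err is just my own name, and I'm not sure this is the idiomatic way to do it):

	// Check whether the kernel launch itself failed
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		printf("Kernel launch failed: %s\n", cudaGetErrorString(err));

	// Wait for the kernel to finish and check for execution errors
	err = cudaDeviceSynchronize();
	if (err != cudaSuccess)
		printf("Kernel execution failed: %s\n", cudaGetErrorString(err));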