Performing a computation on a 2D array in CUDA

Hi, I’m sorry if my question is too obvious; I only started learning CUDA this morning. I have this code in C, and I want to port it to CUDA to improve the execution time:

// CPU reference from the question: for every (x, y) with x < w and y < h, stores into
// dst[x][y] the sum of src[y][x], src[x][y], src[x][x] and src[y][y].
// NOTE(review): the braces are absent in this excerpt, so it reads as pseudo-code.
// NOTE(review): src is indexed with x and with y in both positions, so every index
// must be valid for both dimensions of src — confirm the caller only passes w == h.
void calc(float **src, float **dst, int w, int h)
    for(int x = 0; x<w; x++)
    for(int y = 0; y<h; y++)
        // k is not read anywhere in the lines shown.
        int k = 0;
        dst[x][y] = src[y][x] + src[x][y] + src[x][x] + src[y][y];

I tried to implement this code in CUDA, but I’m not sure I’m doing the right thing:

// Kernel stub from the question; no body has been written yet.
// NOTE(review): the first two parameters are listed without types in the question's text.
__global__ void process(hostSrc,src, int w,int h)
    //I'm completely clueless what should I put in here

// Host wrapper from the question: allocates two pitched device buffers, copies src and dst
// into them, launches `process`, copies the result back into dst and frees the buffers.
// None of the CUDA calls below have their return value checked.
void calc(float **src, float **dst, int w, int h)
     // Despite the "host" prefix, both pointers receive allocations from cudaMallocPitch.
     float* hostSrc, *hostDst;
     size_t pitch;
     // Both allocations write into the same `pitch` variable; the second call overwrites
     // the value produced by the first.
     cudaMallocPitch(&hostSrc,&pitch, w*sizeof(float), h);
     cudaMallocPitch(&hostDst,&pitch, w*sizeof(float), h);
     // NOTE(review): these calls pass the addresses of the local variables (&hostSrc, &pitch,
     // &src) rather than their values; cudaMemcpy2D's documented signature takes the
     // destination/source pointers and the pitches by value — verify against the API reference.
     // NOTE(review): src and dst are float** here; whether the rows they point to are
     // contiguous is not visible from this excerpt — confirm before copying as one 2D region.
     cudaMemcpy2D(&hostSrc, &pitch, &src, w*sizeof(float), w*sizeof(float),h,cudaMemcpyHostToDevice);
     cudaMemcpy2D(&hostDst, &pitch, &dst, w*sizeof(float), w*sizeof(float),h,cudaMemcpyHostToDevice);
     dim3 grid, block;
     // Launch dimensions are left as placeholders in the question.
     block.x = ...;
     grid.x = ...;
     // NOTE(review): the second argument is `src`, the host-side parameter of this function;
     // hostDst is allocated above but never passed to the kernel — presumably hostDst was intended.
     process<<<grid,block>>>(hostSrc,src, w,h); //I don't know how many block and thread should I use. 
     // Same address-of pattern as the uploads above (&dst, &hostDst, &pitch).
     cudaMemcpy2D(&dst, w*sizeof(float), &hostDst, &pitch, w*sizeof(float),h, cudaMemcpyDeviceToHost);
     cudaFree(hostSrc); cudaFree(hostDst);

Can anyone help me, or does anyone have any suggestions? Thanks.

If you are going to ‘process’ the data according to your CPU function:

dst[x][y] = src[y][x] + src[x][y] + src[x][x] + src[y][y];

this implies that width == height, because with a non-square 2-D array (say, of dimensions 2x3) there is no valid [3][3] location (or [2][2] with 0-based indexing), yet the code would still try to read from that address, which is an error.

Here is a quick, easy, non-optimal way of accomplishing the same thing:

#include <algorithm>
#include <iostream>
#include <fstream>
#include <sstream>
#include <utility>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <string>
#include <cmath>
#include <ctime>
#include <cuda.h>
#include <math_functions.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <MMSystem.h>
#pragma comment(lib, "winmm.lib")
#include <crtdbg.h>
using namespace std;

#define THREADS 256
#define WIDTH 4096

bool InitMMTimer(UINT wTimerRes);
void DestroyMMTimer(UINT wTimerRes, bool init);
void _generate_random_values_float(float *A, const int size);

// Kernel (excerpt): each thread computes a flat index from its block and thread indices,
// then splits it into y = offset/N and x = offset%N and checks both against N.
// The statements guarded by the bounds check are not present in this excerpt.
__global__ void process(const float *src, float *dst, const int N){
	int offset=blockIdx.x*blockDim.x+threadIdx.x;
	int y=offset/N,x=offset%N;
	// Only threads whose (y, x) fall inside the N-by-N range enter the guarded body.
	if(y<N && x<N){

// CPU counterpart (excerpt): visits every (y, x) pair with y < N and x < N and computes
// the flat index y*N+x for it. The rest of the loop body is not present in this excerpt.
void calc(const float *src, float *dst, const int N){
	for(int y=0;y<N;y++)for(int x=0;x<N;x++){
		int offset=y*N+x;

// Driver (excerpt): allocates host and device buffers, times a GPU pass and a CPU pass with
// timeGetTime, and compares H_RESULT against D_RESULT element by element.
// NOTE(review): several statements appear to be missing from this excerpt (no kernel launch,
// no call to calc, no timing subtraction, no closing braces); the comments below describe
// only what is visible.
int main(){
	// ch is not used in the lines shown.
	char ch;

	// NOTE(review): HEIGHT is not defined in the visible part of the file (only WIDTH is);
	// lda is not used in the lines shown.
	const int NN=HEIGHT*WIDTH,lda=max(HEIGHT,WIDTH);
	const int numbytes=NN*sizeof(float);
	// Three host buffers of numbytes each: the input, the CPU result and the copied-back GPU result.
	float *SRC=(float *)malloc(numbytes);
	float *H_RESULT=(float *)malloc(numbytes);
	float *D_RESULT=(float *)malloc(numbytes);
	float *D_SRC,*D_DST;

	// Device buffers. On failure the error is printed, but execution continues.
	cudaError_t err=cudaMalloc((void **)&D_SRC,numbytes);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaMalloc((void **)&D_DST,numbytes);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	// Start of the GPU timing window.
	UINT wTimerRes = 0;
	bool init = InitMMTimer(wTimerRes);
	DWORD startTime=0,GPUtime=0,CPUtime=0;
	startTime = timeGetTime();

	err=cudaMemcpy(D_SRC,SRC,numbytes,cudaMemcpyHostToDevice);//copy array values from host into device memory
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	// NOTE(review): no kernel launch is visible between the upload and this synchronize.
	// NOTE(review): cudaThreadSynchronize is a deprecated runtime call; cudaDeviceSynchronize
	// is its documented replacement.
	err = cudaThreadSynchronize();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaMemcpy(D_RESULT,D_DST,numbytes,cudaMemcpyDeviceToHost);//copy back result to host memory
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	// NOTE(review): GPUtime is initialised to 0 above and no assignment to it is visible here,
	// so as shown the printed value does not depend on startTime/endTime.
	DWORD endTime = timeGetTime();
	cout<<"\nGPU total time(copy from host, process, copy back result): "<<float(GPUtime)/1000.0f<< "seconds.\n";
	DestroyMMTimer(wTimerRes, init);

	// Start of the CPU timing window; the timed work itself is not present in this excerpt.
	wTimerRes = 0;
	init = InitMMTimer(wTimerRes);
	startTime = timeGetTime();


	// NOTE(review): as with GPUtime, no assignment to CPUtime is visible.
	endTime = timeGetTime();
	cout<<"\nCPU timing: "<<float(CPUtime)/1000.0f<<" seconds.\n";
	DestroyMMTimer(wTimerRes, init);
	// Element-wise comparison of the CPU and GPU results; the rest of the loop body
	// (presumably updating max_dif) is not present in this excerpt.
	float max_dif=0.0f;
	for(int y=0;y<HEIGHT;y++)for(int x=0;x<WIDTH;x++){
		float h=H_RESULT[y*WIDTH+x],d=D_RESULT[y*WIDTH+x];

	// NOTE(review): err is checked twice with no intervening call visible; the calls that
	// set it (presumably the device frees) are missing from this excerpt.
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	return 0;

// Timer setup helper (excerpt): queries the timer capabilities with timeGetDevCaps and
// returns false if that call fails, true otherwise.
// NOTE(review): `tc` is not declared in the lines shown, and any statements between the
// assignment and the return are missing from this excerpt.
bool InitMMTimer(UINT wTimerRes){
	if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
	// Picks max(tc.wPeriodMin, 1), capped at tc.wPeriodMax.
	// wTimerRes is a by-value parameter, so this assignment does not change the caller's variable.
	wTimerRes = min(max(tc.wPeriodMin, 1), tc.wPeriodMax);
	return true;

// Counterpart to InitMMTimer; main calls it with the same wTimerRes and the bool that
// InitMMTimer returned. The body is not present in this excerpt.
void DestroyMMTimer(UINT wTimerRes, bool init){

// Iterates i over [0, size) and then prints a confirmation message.
// The loop body (presumably the write into A[i]) is not present in this excerpt.
void _generate_random_values_float(float *A, const int size){
	for(int i=0;i<size;i++){
	cout<<"\nFilled vector array with random float values\n";

That seems to be correct, but there are much faster ways to accomplish the same result.
Even this crude version is still 6-10x faster than the CPU version.