// example1.cpp : Defines the entry point for the console application.
//
#include “stdafx.h”
#include <assert.h>
#include <cuda.h>
#include<stdio.h>
void matmul(float **a,float **b,float **c,int N);
void matmul(float **a,float **b,float **c,int N)
{
int i,j,k;
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
for(k=0;k<N;k++)
{
c[i][j] += a[i][k]*b[k][j];
}
}
}
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf("%f\t",a[i][j]);
}
printf("\n");
}
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf(“%f\t”,b[i][j]);
}
printf("\n");
}
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf("%f\t",c[i][j]);
}
printf("\n");
}
}
global void matmulondevice(float **a,float **b,float **c)
{
int i = threadIdx.x;
int j = threadIdx.y;
c[i][j]= a[i][j]+b[i][j];
}
int main(void)
{
float **a,**b,**c;
float **a_d,**b_d,**c_d;
int N = 2;
a = (float**)malloc(sizeof(float *)*N);
b = (float**)malloc(sizeof(float *)*N);
c = (float**)malloc(sizeof(float *)*N);
cudaMalloc((void **) &a_d, sizeof(float*)*N);
cudaMalloc((void **) &b_d, sizeof(float*)*N);
cudaMalloc((void **) &c_d, sizeof(float*)*N);
int i;
int j;
for(i=0;i<N;i++)
{
a[i] = (float*)malloc(sizeof(float)*N);
b[i] = (float*)malloc(sizeof(float)*N);
c[i] = (float*)malloc(sizeof(float)*N);
}
for(i=0;i<N;i++)
{
cudaMalloc((void **) &a_d[i], sizeof(float)*N);
cudaMalloc((void **) &b_d[i], sizeof(float)*N);
cudaMalloc((void **) &c_d[i], sizeof(float)*N);
}
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf(“Enter the %d,%d element”,i,j);
scanf(“%f”,&a[i][j]);
scanf(“%f”,&b[i][j]);
c[i][j]=0;
}
}
matmul(a,b,c,N);
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf(“Enter the %d,%d element”,i,j);
scanf(“%f”,&a[i][j]);
scanf(“%f”,&b[i][j]);
c[i][j]=0;
}
}
dim3 dimBlock(2,2);
matmulondevice<<<1,dimBlock>>>(a_d,b_d,c_d);
for(i=0;i<N;i++)
{
cudaMemcpy(c[i], c_d[i], sizeof(float)*N, cudaMemcpyDeviceToHost);
}
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
printf(“%f\t”,c[i][j]);
}
printf("\n");
}
free(a);free(B);free©;
cudaFree(a_d);cudaFree(b_d);cudaFree(c_d);
}
The C function works fine,
The kernel however is the problem, it gives 0 for all values.After the values are retrieved from the device and printed with the loop statement.
My guess is i am not accessing the thredid’s in the right way or i am not passing the values correctly to the kernel.
please help me