The program runs correctly when n < 1000, but it fails when n > 3000.
Bug: when n > 3000, err = cudaMemcpy(c, cuda_c, sizeof(int) * n * n, cudaMemcpyDeviceToHost) returns error code 4.
I don't know what is happening. Please help me.
Thank you!
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
int n = 3000;//side length of the square matrices (each matrix is n x n)
using namespace std;
// Integer matrix product kernel: c = a * b for n x n matrices stored
// row by row in flat arrays.
//
// Launch layout: a 2D grid of 2D blocks. The x dimension selects the output
// column and the y dimension selects the output row; each thread produces at
// most one element of c. The grid may be larger than the matrix, since
// threads outside the n x n range exit without touching memory.
//
// a, b : input matrices (read only)
// n    : side length of the matrices
// c    : output matrix
__global__ void aMatrix(const int *a, const int *b, int n, int *c) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;  // output column
    const int y = blockIdx.y * blockDim.y + threadIdx.y;  // output row

    // Guard against the partially filled blocks at the grid edges.
    if (x >= n || y >= n) {
        return;
    }

    const int *lhsRow = a + y * n;  // start of row y in a
    const int *rhsCol = b + x;      // top of column x in b

    int acc = 0;
    for (int k = 0; k < n; ++k) {
        acc += lhsRow[k] * rhsCol[k * n];
    }
    c[y * n + x] = acc;
}
// CPU reference implementation of the integer matrix product c = a * b for
// n x n matrices stored row by row in flat arrays.
//
// a, b : input matrices (read only), n * n elements each
// n    : side length of the matrices; n <= 0 leaves c untouched
// c    : output matrix, n * n elements, fully overwritten
void aMatrixCpu(const int *a, const int *b, int n, int *c) {
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            // The accumulator must start from zero for every output element;
            // carrying it across elements would add all previous dot
            // products into the current one.
            int sum = 0;
            for (int i = 0; i < n; i++) {
                sum += a[row * n + i] * b[i * n + col];
            }
            c[row * n + col] = sum;
        }
    }
}
// Reports a failed CUDA runtime call on stdout: prints `what`, the numeric
// error code and the runtime's description of it.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    cout << what << err << " (" << cudaGetErrorString(err) << ")" << endl;
    return false;
}

// Releases the three device buffers and the three host buffers.
// Null pointers are allowed for any argument (free and cudaFree both treat
// a null pointer as a no-op).
static void releaseBuffers(int *a, int *b, int *c,
                           int *cuda_a, int *cuda_b, int *cuda_c) {
    cudaFree(cuda_a);
    cudaFree(cuda_b);
    cudaFree(cuda_c);
    free(a);
    free(b);
    free(c);
}

// Multiplies two random n x n integer matrices on the GPU and then on the
// CPU, printing the elapsed time of each phase and the first element of the
// operands and of the result.
// Returns 0 on success and 1 when an allocation, copy or the kernel fails.
int main() {
    clock_t start, end, gpuEnd, cpuEnd;
    start = clock();  // start of the initialization phase

    // Element and byte counts are computed once in size_t so the
    // multiplication cannot overflow int.
    const size_t count = (size_t)n * n;
    const size_t bytes = sizeof(int) * count;

    int *a = (int *)malloc(bytes);
    int *b = (int *)malloc(bytes);
    int *c = (int *)malloc(bytes);
    int *cuda_a = NULL, *cuda_b = NULL, *cuda_c = NULL;
    if (a == NULL || b == NULL || c == NULL) {
        cout << "Failed to allocate host memory" << endl;
        releaseBuffers(a, b, c, cuda_a, cuda_b, cuda_c);
        return 1;
    }

    // Fill both inputs with pseudo-random values in [0, 19]; a is filled
    // completely before b.
    for (size_t i = 0; i < count; i++) {
        a[i] = rand() % 20;
    }
    for (size_t i = 0; i < count; i++) {
        b[i] = rand() % 20;
    }

    // Each step runs only if all previous steps succeeded (short-circuit &&).
    bool ok = cudaOk(cudaMalloc((void **)&cuda_a, bytes),
                     "Failed to allocate device memory for a");
    ok = ok && cudaOk(cudaMalloc((void **)&cuda_b, bytes),
                      "Failed to allocate device memory for b");
    ok = ok && cudaOk(cudaMalloc((void **)&cuda_c, bytes),
                      "Failed to allocate device memory for c");
    if (ok) {
        cout << "malloc success" << endl;
    }

    // Only the inputs are uploaded; the kernel overwrites every element of
    // cuda_c, so there is nothing useful to copy into it beforehand.
    ok = ok && cudaOk(cudaMemcpy(cuda_a, a, bytes, cudaMemcpyHostToDevice),
                      "Failed to copy the a data from host");
    ok = ok && cudaOk(cudaMemcpy(cuda_b, b, bytes, cudaMemcpyHostToDevice),
                      "Failed to copy the b data from host");
    if (!ok) {
        releaseBuffers(a, b, c, cuda_a, cuda_b, cuda_c);
        return 1;
    }
    cout << "init succeess" << endl;

    end = clock();  // end of initialization, start of the GPU phase
    double endtime = (double)(end - start) / CLOCKS_PER_SEC;
    cout << "Init time:" << endtime * 1000 << "ms" << endl;

    // One thread per output element, 16 x 16 threads per block, with the
    // grid rounded up so it covers the whole matrix.
    dim3 threadsPerBlock(16, 16);
    cout << (n + threadsPerBlock.x - 1) / threadsPerBlock.x << endl;
    dim3 numBlocks((n + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (n + threadsPerBlock.y - 1) / threadsPerBlock.y);
    aMatrix<<<numBlocks, threadsPerBlock, 0>>>(cuda_a, cuda_b, n, cuda_c);

    // A launch does not return an error itself: configuration problems are
    // reported by cudaGetLastError and execution problems by the next
    // synchronizing call, so both are checked here instead of letting the
    // failure surface at the later cudaMemcpy.
    // NOTE(review): a failure that appears only for large n may mean the
    // kernel was aborted for running too long (e.g. by a display-driver
    // watchdog) - confirm with the message printed below.
    ok = cudaOk(cudaGetLastError(), "Failed to launch the kernel");
    ok = ok && cudaOk(cudaDeviceSynchronize(), "Kernel execution failed");
    ok = ok && cudaOk(cudaMemcpy(c, cuda_c, bytes, cudaMemcpyDeviceToHost),
                      "Failed to copy the data from device");
    if (!ok) {
        releaseBuffers(a, b, c, cuda_a, cuda_b, cuda_c);
        return 1;
    }

    // The device buffers are no longer needed once the result is on the host.
    cudaFree(cuda_a);
    cudaFree(cuda_b);
    cudaFree(cuda_c);
    cuda_a = cuda_b = cuda_c = NULL;

    cout << a[0] << "," << b[0] << "," << c[0] << endl;
    gpuEnd = clock();  // end of the GPU phase
    endtime = (double)(gpuEnd - end) / CLOCKS_PER_SEC;
    cout << "GPU Total time:" << endtime * 1000 << "ms" << endl;

    // Same product on the CPU; this overwrites c.
    aMatrixCpu(a, b, n, c);
    cout << a[0] << "," << b[0] << "," << c[0] << endl;
    cpuEnd = clock();  // end of the CPU phase
    endtime = (double)(cpuEnd - gpuEnd) / CLOCKS_PER_SEC;
    cout << "CPU Total time:" << endtime * 1000 << "ms" << endl;

    releaseBuffers(a, b, c, cuda_a, cuda_b, cuda_c);

    getchar();  // wait for a key press before exiting
    return 0;
}