Hello, developers.
I'm trying to write a simple program that shows a GPU computation running much faster than the same computation on the CPU.
But I got the opposite result: the CPU version runs much faster than the GPU version.
I know this result must be wrong, but I don't know why...
I'm using Visual Studio 2017 with the CUDA 10.0 toolkit.
--- General Information for device 0 ---
Name: GeForce GTX 1060
Compute capability: 6.1
Clock rate: 1670500
Device copy overlap: Enabled
Kernel execution timeout : Enabled
--- Memory Information for device 0 ---
Total global mem: -2147483648
Total constant Mem: 65536
Max mem pitch: 2147483647
Texture Alignment: 512
--- MP Information for device 0 ---
Multiprocessor count: 10
Shared mem per mp: 49152
Registers per mp: 65536
Threads in warp: 32
Max threads per block: 1024
Max thread dimensions: (1024, 1024, 64)
Max grid dimensions: (2147483647, 65535, 65535)
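(A side note on the negative "Total global mem": I believe that is only a printing problem in my device-query code, not a real problem with the card. totalGlobalMem is a size_t, and I think a 6 GB card wraps to exactly -2147483648 when printed as a 32-bit int. A minimal sketch of how I assume it should be printed:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // totalGlobalMem is a size_t, so %zu avoids the 32-bit overflow
    // that makes a 6 GB card print as -2147483648
    printf("Name: %s\n", prop.name);
    printf("Total global mem: %zu\n", prop.totalGlobalMem);
    return 0;
}

Here is my full test code:)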
#include <iostream>
#include <time.h>
#include <windows.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N 10
//#define N 10000
using namespace std;
__global__ void add2(int *a, int *b, int *c) {
    // Index by block (one thread per block). With a small N the GPU looks
    // slower than the CPU, but with a big N the GPU is much faster.
    int tid = blockIdx.x;
    // Index by thread (one block of N threads) -- same result as blockIdx.x
    //int tid = threadIdx.x;
    // Global index over a grid of blocks of threads -- same result as above
    //int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}
void add(int *a, int *b, int *c) {
    int tid = 0;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += 1;
    }
}
int main() {
    // Values for time measurement
    LARGE_INTEGER tFreq, tStart, tEnd;
    cudaEvent_t start, stop;
    float tms, ms;
    int a[N], b[N], c[N];        // CPU arrays
    int *dev_a, *dev_b, *dev_c;  // GPU arrays
    // Allocate GPU memory
    cudaMalloc((void**)&dev_a, N * sizeof(int));
    cudaMalloc((void**)&dev_b, N * sizeof(int));
    cudaMalloc((void**)&dev_c, N * sizeof(int));
    // Fill 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }
    // Copy the input values from the CPU to the GPU
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
    //////////////////////////////////////
    QueryPerformanceFrequency(&tFreq);  // frequency of the counter
    QueryPerformanceCounter(&tStart);   // time count start
    // CPU operation
    add(a, b, c);
    QueryPerformanceCounter(&tEnd);     // time count end
    tms = ((tEnd.QuadPart - tStart.QuadPart) / (float)tFreq.QuadPart) * 1000;
    //////////////////////////////////////
    // Show the CPU result
    cout << fixed;
    cout.precision(50);
    cout << "CPU Time=" << tms << endl << endl;
    /*for (int i = 0; i < N; i++) {
        printf("CPU calculate = %d + %d = %d\n", a[i], b[i], c[i]);
    }*/
    cout << endl;
    ///////////////////////////////////////
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // GPU operation
    add2<<<N, 1>>>(dev_a, dev_b, dev_c);        // one thread per block
    //add2<<<1, N>>>(dev_a, dev_b, dev_c);      // one block of N threads
    //add2<<<128, 128>>>(dev_a, dev_b, dev_c);  // grid of blocks (for N = 10000)
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    ///////////////////////////////////////
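    // My understanding of the three launch configurations above:
    // <<<N, 1>>>     -> N blocks of 1 thread, blockIdx.x indexes the elements
    // <<<1, N>>>     -> 1 block of N threads, threadIdx.x indexes the elements
    // <<<128, 128>>> -> 128 * 128 = 16384 threads, enough for N = 10000,
    //                   indexed with threadIdx.x + blockIdx.x * blockDim.x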
    // Show the GPU result
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    cout << fixed;
    cout.precision(50);
    cout << "GPU Time=" << ms << endl << endl;
    /*for (int i = 0; i < N; i++) {
        printf("GPU calculate = %d + %d = %d\n", a[i], b[i], c[i]);
    }*/
    // Free the GPU memory and the timing events
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
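One thing my code does not do is check CUDA errors. In case a silent failure is skewing the numbers, this is the kind of check I assume I should add after each runtime call and kernel launch (CUDA_CHECK is just my own name for the macro):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper macro (my own naming): aborts with the CUDA error
// string if a runtime call or the previous kernel launch failed.
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err = (call);                                   \
        if (err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

// Usage after a kernel launch:
//   add2<<<N, 1>>>(dev_a, dev_b, dev_c);
//   CUDA_CHECK(cudaGetLastError());       // catches launch errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches execution errors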
Please help me.
Thank you.
simple_time_duration_GPU_CPU.cu (3.11 KB)