Hardware: Intel Core i3-2100 CPU @ 3.10 GHz, 4 GB RAM, GeForce GTX 580, Windows 7 64-bit.
According to the profiler, the += version performs two global-memory transactions (a separate load and a store).
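Presumably the two transactions come from the plain += being compiled into a separate global load and global store. A conceptual sketch of the difference (hypothetical kernels for illustration only, not taken from the programs below):

__global__ void plainAddKernel(float* dst, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // "dst[idx] += 1" is a read-modify-write performed by the SM:
        float tmp = dst[idx];   // transaction 1: global load
        tmp += 1.0f;
        dst[idx] = tmp;         // transaction 2: global store
    }
}

__global__ void atomicAddKernel(float* dst, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // atomicAdd with an unused return value is issued as a single
        // instruction; the read-modify-write is handled by the memory
        // subsystem rather than as a separate load and store in the SM.
        atomicAdd(&dst[idx], 1.0f);
    }
}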
Code using +=:
#include <stdio.h>
#include <string.h>
#include <iostream>
using namespace std;

////////////////////////////////////////////////////////////////////////////////
// forward declaration
void utilities(float* dst, int sizeY, int sizeX);

const int SIZE_MEMORY_DY = 1;
const int SIZE_MEMORY_DX = 32;

__global__ void utilitiesKernel(float* dst, size_t dstPitchElements, const int sizeY, const int sizeX) {
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int i = bx * SIZE_MEMORY_DX + tx;
    int j = by * SIZE_MEMORY_DY + ty;
    if (i < sizeX && j < sizeY) {
        // A plain (non-atomic) add is used.
        dst[j * dstPitchElements + i] += 1;
    }
}

int main(int argc, char** argv)
{
    const int sizeY = 5000;
    const int sizeX = 5000;

    // allocate and zero-initialize the host matrix
    unsigned int size_src = sizeY * sizeX;
    unsigned int mem_size_src = sizeof(float) * size_src;
    float* h_dst = (float*)malloc(mem_size_src);
    for (int j = 0; j < sizeY; j++) {
        for (int i = 0; i < sizeX; i++) {
            h_dst[j * sizeX + i] = 0;
        }
    }

    utilities(h_dst, sizeY, sizeX);

    free(h_dst);
    return 0;
}

void utilities(float* dst, int sizeY, int sizeX) {
    // allocate pitched device memory
    float* d_dst;
    size_t dst_pitch_bytes;
    cudaMallocPitch((void**) &d_dst, &dst_pitch_bytes, sizeX * sizeof(float), sizeY);
    size_t dst_pitch_elements = dst_pitch_bytes / sizeof(float);

    // copy host memory to device
    cudaMemcpy2D(d_dst, dst_pitch_bytes, dst, sizeX * sizeof(float), sizeX * sizeof(float), sizeY, cudaMemcpyHostToDevice);

    // one 32x1 thread block per 32x1 tile of the matrix
    dim3 threads(32, 1);
    dim3 grid((sizeX + SIZE_MEMORY_DX - 1) / SIZE_MEMORY_DX, (sizeY + SIZE_MEMORY_DY - 1) / SIZE_MEMORY_DY);
    utilitiesKernel<<<grid, threads>>>(d_dst, dst_pitch_elements, sizeY, sizeX);
    cudaThreadSynchronize();

    // copy device memory back to host
    cudaMemcpy2D(dst, sizeX * sizeof(float), d_dst, dst_pitch_bytes, sizeX * sizeof(float), sizeY, cudaMemcpyDeviceToHost);
    cudaFree(d_dst);
}
Code using atomicAdd:
#include <stdio.h>
#include <string.h>
#include <iostream>
using namespace std;

////////////////////////////////////////////////////////////////////////////////
// forward declaration
void utilities(float* dst, int sizeY, int sizeX);

const int SIZE_MEMORY_DY = 1;
const int SIZE_MEMORY_DX = 32;

__global__ void utilitiesKernel(float* dst, size_t dstPitchElements, const int sizeY, const int sizeX) {
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int i = bx * SIZE_MEMORY_DX + tx;
    int j = by * SIZE_MEMORY_DY + ty;
    if (i < sizeX && j < sizeY) {
        // An atomic add is used.
        atomicAdd(&dst[j * dstPitchElements + i], 1);
    }
}

int main(int argc, char** argv)
{
    const int sizeY = 5000;
    const int sizeX = 5000;

    // allocate and zero-initialize the host matrix
    unsigned int size_src = sizeY * sizeX;
    unsigned int mem_size_src = sizeof(float) * size_src;
    float* h_dst = (float*)malloc(mem_size_src);
    for (int j = 0; j < sizeY; j++) {
        for (int i = 0; i < sizeX; i++) {
            h_dst[j * sizeX + i] = 0;
        }
    }

    utilities(h_dst, sizeY, sizeX);

    free(h_dst);
    return 0;
}

void utilities(float* dst, int sizeY, int sizeX) {
    // allocate pitched device memory
    float* d_dst;
    size_t dst_pitch_bytes;
    cudaMallocPitch((void**) &d_dst, &dst_pitch_bytes, sizeX * sizeof(float), sizeY);
    size_t dst_pitch_elements = dst_pitch_bytes / sizeof(float);

    // copy host memory to device
    cudaMemcpy2D(d_dst, dst_pitch_bytes, dst, sizeX * sizeof(float), sizeX * sizeof(float), sizeY, cudaMemcpyHostToDevice);

    // one 32x1 thread block per 32x1 tile of the matrix
    dim3 threads(32, 1);
    dim3 grid((sizeX + SIZE_MEMORY_DX - 1) / SIZE_MEMORY_DX, (sizeY + SIZE_MEMORY_DY - 1) / SIZE_MEMORY_DY);
    utilitiesKernel<<<grid, threads>>>(d_dst, dst_pitch_elements, sizeY, sizeX);
    cudaThreadSynchronize();

    // copy device memory back to host
    cudaMemcpy2D(dst, sizeX * sizeof(float), d_dst, dst_pitch_bytes, sizeX * sizeof(float), sizeY, cudaMemcpyDeviceToHost);
    cudaFree(d_dst);
}
The += version runs in 4.7 ms; the atomicAdd version runs in 2.7 ms.
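For reference, kernel time can be measured with CUDA events like this (a sketch placed around the kernel launch in utilities(); that this is how the figures above were obtained is an assumption):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
utilitiesKernel<<<grid, threads>>>(d_dst, dst_pitch_elements, sizeY, sizeX);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);               // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed GPU time in milliseconds
printf("kernel time: %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);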