I make a simple program that performs dot-product of two vectors below.
Each element of the two vectors is initialized with 0.001.
As I showed the result below, it sometimes produce wrong results.
Can anybody help me?
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <math.h>
void PrintUsage(const char* progName) {
printf("Usage: %s <n> <niter> <gangs> <vectors>\n", progName);
printf(" <n> : the number of elements.\n");
printf(" <ntier> : the number of iterations.\n");
printf(" <gangs> : the number of gangs.\n");
printf(" <vectors>: the number of vectors.\n");
exit(0);
}
double get_time(void) {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec/1.0e6;
}
int verify(float* M_C, float* M_C2, int n) {
unsigned wrong_count = 0;
for(int i = 0; i < n; ++i ) {
if( fabs(M_C[i] - M_C2[i]) > 0.0001 ) {
if( wrong_count < 5 ) {
printf("[%d] DIFF [%f] [%f]\n", i, M_C[i], M_C2[i]);
} else {
return 0;
}
}
}
return wrong_count == 0;
}
float run_omp(float* M_A, float* M_B, int n, int niter, int gangs, int vectors) {
int i;
int iter;
double start_time, end_time;
double accumulated_time = 0.0;
float sum = 0.0f;
#pragma acc data copyin(M_A[0:n], M_B[0:n])
{
for(iter = 0; iter < niter; ++iter ) {
sum = 0.0;
start_time = get_time();
#pragma acc parallel loop\
private(i) \
firstprivate(n) \
present(M_A[0:n], M_B[0:n]) \
reduction(+:sum) \
gang(gangs) vector(vectors)
for( i = 0; i < n; ++i )
{
sum += M_A[i] * M_B[i];
}
end_time = get_time();
accumulated_time += end_time - start_time;
}
}
printf("%d|%d|Elapsed time (ACC) | %f\n", gangs, vectors, accumulated_time);
return sum;
}
int main(int argc, const char* argv[]) {
if( argc != 5 ) {
PrintUsage(argv[0]);
}
int n = atoi(argv[1]);
int niter = atoi(argv[2]);
int gangs = atoi(argv[3]);
int vectors = atoi(argv[4]);
if( n <= 0 || gangs <= 0 || niter <= 0 || vectors <= 0 ) {
PrintUsage(argv[0]);
}
printf("Calculating %d elements...\n", n);
float* M_A = (float*)malloc(n*sizeof(float));
float* M_B = (float*)malloc(n*sizeof(float));
for( int i = 0; i < n; ++i ) {
M_A[i] = 0.001;
M_B[i] = 0.001;
}
float omp_sum = run_omp(M_A, M_B, n, niter, gangs, vectors);
if( fabs(omp_sum - (M_A[0]*M_B[0]*n)) < 0.01 ) {
printf("VERIFICATION: SUCCESSFUL.\n");
} else {
printf("VERIFICATION: FAILED!\n");
printf("omp_sum = %f. should be %f\n", omp_sum, (M_A[0]*M_B[0]*n));
}
return 0;
}
Result of pgaccelinfo
OpenCL Platform: AMD Accelerated Parallel Processing
OpenCL Vendor: Advanced Micro Devices, Inc.Device Number: 0
Device Name: Hawaii
Available: Yes
Compiler Available: Yes
Board Name:
Device Version: OpenCL 2.0 AMD-APP (1800.5)
Global Memory Size: 8535408640
Maximum Object Size: 4244635648
Global Cache Size: 16384
Free Memory: 8328319000
Max Clock (MHz): 1055
Compute Units: 44
SIMD Units: 4
SIMD Width: 16
GPU Cores: 2816
Wavefront Width: 64
Constant Memory Size: 65536
Local Memory Size: 32768
Workgroup Size: 256
Address Bits: 64
ECC Support: No
PGI Compiler Option: -ta=radeon:hawaii
Compilation
[junghyun@aidi03 openacc]$ make clean;make
rm -rf dotProduct dotProduct.o
pgc++ -acc -ta=radeon:hawaii -Minfo -c -o dotProduct.o dotProduct.cpp
run_omp(float *, float , int, int, int, int):
45, Generating copyin(M_A[:n],M_B[:n])
55, Generating present(M_A[:n],M_B[:n])
Accelerator kernel generated
Generating Radeon code
55, #pragma acc loop gang(gangs), vector(vectors) / global dim(0) local dim(0) */
Sum reduction generated for sum
pgc++ -acc -ta=radeon:hawaii -Minfo -o dotProduct dotProduct.o
Result
[junghyun@aidi03 openacc]$ ./dotProduct
Usage: ./dotProduct
: the number of elements.
: the number of iterations.
: the number of gangs.
: the number of vectors.
[junghyun@aidi03 openacc]$ ./dotProduct 1024 1 32 32
Calculating 1024 elements…
No protocol specified
No protocol specified
No protocol specified
32|32|Elapsed time (ACC) | 0.001944
VERIFICATION: SUCCESSFUL.
[junghyun@aidi03 openacc]$ ./dotProduct 10240 1 32 32
Calculating 10240 elements…
No protocol specified
No protocol specified
No protocol specified
32|32|Elapsed time (ACC) | 0.001643
VERIFICATION: FAILED!
omp_sum = 0.000000. should be 0.010240
[junghyun@aidi03 openacc]$ ./dotProduct 16777216 1 32 32
Calculating 16777216 elements…
No protocol specified
No protocol specified
No protocol specified
32|32|Elapsed time (ACC) | 0.015805
VERIFICATION: FAILED!
omp_sum = 0.000000. should be 16.777218