Hi to all. I’m new to this forum, cuda, and gpu programming.
My code is doing slotwise array computation, and until now, I ran this code sequentially on my cpu. I’m trying now to write it in CUDA, and run it in parallel on GPUs and I’m expecting to be quite faster.
I guess I’m doing something wrong, because it needed 56 microseconds to run the code on GPUs, and it takes around 7-8 microseconds to run it sequentially.
Please can someone take a look at my code, and check if there is some stupid/obvious mistake?
I will also attach my GPU specifications, as it is a fairly old Quadro FX card.
Here is my deviceQuery output:
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: “Quadro FX 580”
CUDA Driver Version / Runtime Version 6.5 / 6.5
CUDA Capability Major/Minor version number: 1.1
Total amount of global memory: 511 MBytes (536150016 bytes)
( 4) Multiprocessors, ( 8) CUDA Cores/MP: 32 CUDA Cores
GPU Clock rate: 1125 MHz (1.12 GHz)
Memory Clock rate: 800 Mhz
Memory Bus Width: 128-bit
Maximum Texture Dimension Size (x,y,z) 1D=(8192), 2D=(65536, 32768), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers 1D=(8192), 512 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(8192, 8192), 512 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 8192
Warp size: 32
Maximum number of threads per multiprocessor: 768
Maximum number of threads per block: 512
Max dimension size of a thread block (x,y,z): (512, 512, 64)
Max dimension size of a grid size (x,y,z): (65535, 65535, 1)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 256 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): No
Device PCI Bus ID / PCI location ID: 15 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 6.5, CUDA Runtime Version = 6.5, NumDevs = 1, Device0 = Quadro FX 580
Result = PASS
…and here is my code:
#include <stdio.h>
#include <stdlib.h>
//current capability 1.1
//2.0 capability needed for printf (in kernel)
//1.3 capability needed for using double precision
__global__
// Computes dv/dt for every slot of a genes-x-nuclei array, one thread per slot.
//
// Layout assumption (from the indexing below): the arrays v/vext/voutput are
// flattened so that slot i belongs to gene (i % ngenes) of nucleus (i / ngenes);
// consecutive slots of one nucleus are contiguous. E is ngenes x egenes and
// T is ngenes x ngenes, both row-major by gene.
//
// n        total number of slots (ngenes * number of nuclei)
// ngenes   genes per nucleus
// egenes   external genes per nucleus
// v        current state (read)            vext    external inputs (read)
// voutput  dv/dt result (write)            h,m     per-gene offsets
// E,T      interaction matrices            R       per-gene max synthesis rate
// D        per-gene diffusion coefficient  lambda  per-gene decay rate
// tau      per-gene delay — currently unused, kept for interface stability
void dvdt(int n, int ngenes, int egenes, float *v, float *vext, float *voutput, float *h, float *m, float *E, float *T, float *R, float *D, float *lambda, float *tau)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int k, k2, j;
// Float literal instead of an int: avoids an int->float conversion in the
// product below. Semantics are unchanged (value is 1).
const float l_rule = 1.0f;
float temp;
if (i < n) {
k = i % ngenes;   // gene index of this slot within its nucleus
k2 = i % egenes;  // external-gene index of this slot
temp = h[k];
temp += m[k];// * bcd.array[i]; // ap is nuclear index //Damjan: will not use this for now. We can add it later
// (i - k2) is the start of this nucleus's block in vext, so the inner loop
// walks the egenes slots belonging to the same nucleus.
for( j = 0; j < egenes; j++ ) {
temp += E[( k * egenes ) + j] * vext[(i-k2) + j];
}
// Likewise (i - k) is the start of this nucleus's block in v.
for( j = 0; j < ngenes; j++ ) {
temp += T[( k * ngenes ) + j] * v[(i-k) + j];
}
// Use float literals (0.5f, 1.0f) and rsqrtf: the original 0.5 / 1 / rsqrt
// silently promoted the expression to double, which CC 1.1 cannot execute
// natively (and which is much slower on devices that can).
// rsqrtf(x) = 1/sqrtf(x); the reciprocal-sqrt intrinsic is cheaper than a
// divide followed by a sqrt.
voutput[i] = -lambda[k] * v[i] + l_rule * R[k] * 0.5f * (1.0f + temp * rsqrtf( 1.0f + temp * temp ));
if( n != ngenes ) // if n == ngenes we have one nucleus, therefore no diffusion
{ // multiple nuclei -> add diffusion from the neighbouring nuclei
if( i < ngenes )
{ // anterior-most nucleus: no left neighbour (its contribution is 0)
voutput[i] += D[k] * ( v[i + ngenes] - v[i] );
}
else if (i >= ngenes && i < n - ngenes)
{ // interior nucleus: both neighbours contribute
voutput[i] += D[k] * ( ( v[i - ngenes] - v[i] ) + ( v[i + ngenes] - v[i] ) );
}
else
{ // n - ngenes <= i < n: posterior-most nucleus, no right neighbour
voutput[i] += D[k] * ( v[i - ngenes] - v[i] );
}
}
}
}
// CUDA_CHECK: print a readable message and abort if a runtime API call fails.
// Replaces the original pattern of storing cudaError_t results in plain ints
// and checking each one with a hand-written if/printf pair.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                 \
            exit(EXIT_FAILURE);                                                \
        }                                                                     \
    } while (0)

// Host driver: sets up test data, copies it to the device, launches the dvdt
// kernel once, times it with CUDA events, and copies the result back.
int main(void)
{
int ngenes = 4;
int egenes = 4;
int nnucs = 53;
int n = ngenes * nnucs; // total number of slots (genes x nuclei)
int blockSize = 128;    // threads per block (multiple of the warp size)
float *v, *voutput, *d_x, *d_z, *d_y, *vext, *rh, *rm, *rE, *rT, *rR, *rD, *rlambda, *rtau; //*bot,
// Per-gene parameters (ngenes entries) and gene-interaction matrices
// (ngenes x ngenes for T, ngenes x egenes for E), row-major by gene.
float R[4] = { 0, 0, 0, 0 };
float T[16] = { -0.02875641, 0.03773355, -0.08696411, 0.02085833,
0.07752513, -0.05127861, 0.08351538, -0.03693460,
-0.02272315, -0.05630814, 0.03840626, 0.07150631,
0.02276078, -0.02350780, 0.01856495, -0.06116937
};
float E[16] = { -0.14654266, 0.01651517, 0.06442419, -0.05467585,
-0.12247616, -0.10550997, 0.10687539, 0.00000000,
0.02896215, -0.07365022, -0.06193835, 0.00000000,
0.02710513, -0.04491538, 0.10493181, 0.00000000
};
float m[4] = { 0, 0, 0, 0 };
float h[4] = { -2.5, -2.5, -2.5, -2.5 };
float D[4] = { 0.237, 0.3, 0.115, 0.3 };
float lambda[4] = { 13.76469068, 7.27890037, 11.63317492, 12.03105457 };
float tau[4] = { 6, 6, 6, 6 };
// Host buffers — check malloc results instead of assuming success.
v = (float*)malloc(n * sizeof(float));
vext = (float*)malloc(n * sizeof(float));
voutput = (float*)malloc(n * sizeof(float));
if (v == NULL || vext == NULL || voutput == NULL) {
fprintf(stderr, "host allocation failed\n");
exit(EXIT_FAILURE);
}
// Device buffers. Every cudaMalloc/cudaMemcpy is now checked (the originals
// were unchecked or checked through copy-pasted int comparisons). Note: rm is
// sized with ngenes, not the hard-coded 4 the original used.
CUDA_CHECK(cudaMalloc(&d_x, n*sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_y, n*sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_z, n*sizeof(float)));
CUDA_CHECK(cudaMalloc(&rh, ngenes*sizeof(float)));
CUDA_CHECK(cudaMalloc(&rm, ngenes*sizeof(float)));
CUDA_CHECK(cudaMalloc(&rE, ngenes*egenes*sizeof(float)));
CUDA_CHECK(cudaMalloc(&rT, ngenes*ngenes*sizeof(float)));
CUDA_CHECK(cudaMalloc(&rD, ngenes*sizeof(float)));
CUDA_CHECK(cudaMalloc(&rR, ngenes*sizeof(float)));
CUDA_CHECK(cudaMalloc(&rlambda, ngenes*sizeof(float)));
CUDA_CHECK(cudaMalloc(&rtau, ngenes*sizeof(float)));
// Deterministic test input.
for (int i = 0; i < n; i++) {
v[i] = i+3;
vext[i] = i+5;
}
CUDA_CHECK(cudaMemcpy(d_x, v, n*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_y, vext, n*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(rh, h, ngenes*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(rm, m, ngenes*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(rE, E, ngenes*egenes*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(rT, T, ngenes*ngenes*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(rR, R, ngenes*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(rD, D, ngenes*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(rlambda, lambda, ngenes*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(rtau, tau, ngenes*sizeof(float), cudaMemcpyHostToDevice));
// Integer ceiling division: exact, and avoids the float ceil() the original
// used without including <math.h>.
int gridSize = (n + blockSize - 1) / blockSize;
printf("gridsize = %d\n", gridSize);
// Time one kernel launch with CUDA events; cudaEventElapsedTime reports
// milliseconds. Note: a single launch on a tiny n is dominated by launch
// overhead, so this is not a meaningful GPU-vs-CPU comparison.
float responseTime;
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
CUDA_CHECK(cudaEventRecord(start));
dvdt<<<gridSize, blockSize>>>(n, ngenes, egenes, d_x, d_y, d_z, rh, rm, rE, rT, rR, rD, rlambda, rtau);
// Kernel launches don't return an error code; ask the runtime explicitly.
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&responseTime, start, stop));
printf("elapsed time = %g us\n", responseTime * 1000.0f); // ms * 1000 = microseconds
// Blocking device-to-host copy; also surfaces any asynchronous kernel error.
CUDA_CHECK(cudaMemcpy(voutput, d_z, n*sizeof(float), cudaMemcpyDeviceToHost));
// Release timing events and device memory.
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_z);
cudaFree(rh);
cudaFree(rm);
cudaFree(rE);
cudaFree(rT);
cudaFree(rR);
cudaFree(rD);
cudaFree(rlambda);
cudaFree(rtau);
// Release host memory.
free(v);
free(vext);
free(voutput);
return 0;
}
edit:
I edited the code a bit to correct few mistakes and apply some of your suggestions.