I’ve made a kernel which speed up the CPU calculation from 1 hour to about 1,5minutes. I thought wow that is nice. But when I saw I was not writing the results in my kernel I changed it. After writing the result the performance dropped very much.
I went from 1,5min to almost 6… Is this a non coalesced memory thing or what?
Maybe someone can tell me how to speed this up or implement the coalesced memory thingy… I read something about it on the supercomputing 2007 slides but couldn’t figure it out. I hope someone can help me with it…
Here’s my code: TemperatureIteration.cu
//Includes
#include <stdio.h>
//#include "typedefs.h" //needed for byte
#include "../../../llib/src/typedefs.h"
#include "TemperatureIteration.h"
__device__ bool mediumFixed(int medium);
__device__ float getBioheat(int medium);
__global__ void stepTwoTimeLevelKernel(int gridX,
float* T,
const int* v,
const float* source,
const float* sink,
const float dt,
int dimx, int dimy, int dimz)
{
// set blockindex
int bx = blockIdx.x;
int by = blockIdx.y;
// set threadindex
int tx = threadIdx.x;
int ty = threadIdx.y;
// determine x, y and z pending on threadindex and blocksize
int i = tx + (bx % gridX) * blockDim.x;
int j = ty + __mul24(by, blockDim.y);
int k = bx / gridX;
//printf("(i,j,k): %i,%i,%i\n",i,j,k);
if( i < dimx && j < dimy && k < dimz){
int dimxdimy = __mul24(dimx,dimy);
int index = i + __mul24(j,dimx) + __mul24(k,dimxdimy);
//int nMedia = v.numberOfMedia();
int m0 = v[index];
//printf("v[%i]: %i, m0: %i\n",index,v[index],m0);
// Check if temperature is fixed.
if(!mediumFixed(m0)){
int offset = m0 * numberOfMedia;
float C1 = coefX_[offset + v[index-1]];
float C2 = coefX_[offset + v[index+1]];
float C3 = coefY_[offset + v[index-dimx]];
float C4 = coefY_[offset + v[index+dimx]];
float C5 = coefZ_[offset + v[index-dimxdimy]];
float C6 = coefZ_[offset + v[index+dimxdimy]];
float C7 = -(C1 + C2 + C3 + C4 + C5 + C6);
float TempAtCentre = T[index];
float deltaT =
C1 * T[index-1] + C2 * T[index+1] +
C3 * T[index-dimx] + C4 * T[index+dimx] +
C5 * T[index-dimxdimy] + C6 * T[index+dimxdimy] +
C7 * TempAtCentre;
if (source != 0) {
deltaT += source[index];
}
deltaT -= sink[index];
if (0 < getBioheat(m0)) {
deltaT -= getBioheat(m0) * T[index];
}
// Now convert deltaT to a real temperature delta.
deltaT *= dt * coef_[m0];
T[index] += deltaT;
}
}
}
__device__ bool mediumFixed(int medium){
for(int i = 0; i < numberOfFixedMedia; i++){
if(medium == fixedMedia[i]) return true;
}
return false;
}
__device__ float getBioheat(int medium){
//printf("medium: %i\n",medium);
int mediumIndex = -1;
for(int i = 0; i < numberOfPresentMedia; i++){
if(medium == presentMedia[i]){
mediumIndex = i;
continue;
}
}
//printf("bioheat[%i]: %f\n", mediumIndex, bioheat[mediumIndex]);
return bioheat[mediumIndex];
}
//WRAPPER
void stepTwoTimeLevelGPU(float* T,
int* v,
float* source,
float* sink,
float dt,
int dimx, int dimy, int dimz,
dim3 dimGrid, dim3 dimBlock)
{
stepTwoTimeLevelKernel<<<dimGrid, dimBlock>>>((dimGrid.x / dimz),
T,
v,
source,
sink,
dt,
dimx, dimy, dimz);
cudaThreadSynchronize();
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess)
printf("error :%s\n",cudaGetErrorString(error));
// check if kernel execution generated and error
}
TemperatureIteration.h
#ifndef __TEMPERATURE_ITERATION_H__
#define __TEMPERATURE_ITERATION_H__
#include "../../../llib/src/typedefs.h"
#include "cuda/cuda.h"
#include "cuda_runtime.h"
//typedef unsigned char byte;
#define N 73
__constant__ int numberOfMedia;
__constant__ int numberOfFixedMedia;
__constant__ int numberOfPresentMedia;
__constant__ int fixedMedia[N];
//coefX,Y,Z size is normally numberOfMedia^2 to make sure it will fit we take 128^2
__constant__ float coefX_[N*N];
__constant__ float coefY_[N*N];
__constant__ float coefZ_[N*N];
__constant__ float coef_[N];
__constant__ float bioheat[N];
__constant__ int presentMedia[N];
extern "C" {
void stepTwoTimeLevelGPU(float* T,
int* v,
float* source,
float* sink,
float dt,
int dimx, int dimy, int dimz,
dim3 dimGrid, dim3 dimBlock);
}
#endif