Hello!
Im trying to write a program that receives some data, calculate using Runge-Kutta 4 method and print to a txt file.
#define TAM_MAX 1e7
...
__device__ float fi(float x,float z, float p1,float p2,float nada){
float k1,k2,k3,k4;
if(threadIdx.x == 0){
k1 = dxdt(x , z , p1);
k2 = dxdt(x+k1*h/2 , z+k1*h/2 , p1+k1*h/2);
k3 = dxdt(x+k2*h/2 , z+k2*h/2 , p1+k2*h/2);
k4 = dxdt(x+k3*h , z+k3*h , p1+k3*h);
}
if(threadIdx.x == 1){
k1 = dzdt(x,z,p2);
k2 = dzdt(x+k1*h/2,z+k1*h/2,p2+k1*h/2);
k3 = dzdt(x+k2*h/2,z+k2*h/2,p2+k2*h/2);
k4 = dzdt(x+k3*h,z+k3*h,p2+k3*h);
}
if(threadIdx.x == 2){
k1 = dp1dt(x,z);
k2 = dp1dt(x+k1*h/2,z+k1*h/2);
k3 = dp1dt(x+k2*h/2,z+k2*h/2);
k4 = dp1dt(x+k3*h,z+k3*h);
}
if(threadIdx.x == 3){
k1 = dp2dt(x,z);
k2 = dp2dt(x+k1*h/2,z+k1*h/2);
k3 = dp2dt(x+k2*h/2,z+k2*h/2);
k4 = dp2dt(x+k3*h,z+k3*h);
}
return h*(k1+2*k2+2*k3+k4)/6;
}
...
//Inside my main:
dim3 dimBlock(4,Raios);
dim3 dimGrid(Pos0,1);
k_iteracao<<<dimGrid,dimBlock>>>(d_inx,d_inteta,p,d_p1,d_p2,Pos0,Raios,d_X);
printf("%s\n",cudaGetErrorString(cudaGetLastError()));
printf("%s\n",cudaGetErrorString(cudaThreadSynchronize()));
...
cudaMemcpy(h_X,d_X,TAM_MAX*sizeof(float),cudaMemcpyDeviceToHost);
...
__global__ void k_iteracao(float *d_inx,float *d_inteta, float nada,float *d_p1,float *d_p2,int pos, int raios ,float *d_X){
__shared__ float Y[4];
int i = 0,posicao;
float t = 0 , t0 = 0;
Y[0] = d_inx[blockIdx.x];
Y[1] = 0;
Y[2] = d_p1[threadIdx.y];
Y[3] = d_p2[threadIdx.y];
while((Y[0] <= largurad) && (Y[0] >= largurae) && (Y[1] <= profundidade) && (t <= tempolimite)){
//Cada thread resolve a iteracao de acordo com a sua ID e depois espera os outros
Y[threadIdx.x] = Y[threadIdx.x]+fi(threadIdx.x,Y[0],Y[1],Y[2],Y[3],h);
// if( threadIdx.x == 1) printf("Tiro: %d - Raio: %d - x: %f, z: %f, p1: %f, p2: %f\n",blockIdx.x,threadIdx.y,Y[0],Y[1],Y[2],Y[3]);
if( threadIdx.x == 0 && threadIdx.y == 0 ){
posicao = blockIdx.x*(int)(TAM_MAX / pos) + threadIdx.y*(int)(TAM_MAX / (raios*pos));
d_X[posicao + (i*5+0)] = blockIdx.x;
d_X[posicao + (i*5+1)] = threadIdx.y;
d_X[posicao + (i*5+2)] = t;
d_X[posicao + (i*5+3)] = Y[0];
d_X[posicao + (i*5+4)] = Y[1];
// printf("x: %f,z: %f,p1: %f,p2: %f\n",d_X[posicao + (i*5+0)],d_X[posicao + (i*5+1)],d_X[posicao + (i*5+2)],d_X[posicao + (i*5+3)],d_X[posicao + (i*5+4)]);
i++;
}
t = t + h;
}
d_X[blockIdx.x*(int)(TAM_MAX / pos) + threadIdx.y*(int)(TAM_MAX / (raios*pos)) + i*5 + 0] = -1;
}
When i execute it i receive 2 error messages after kernel execution:
"invalid configuration argument
the launch timed out and was terminated"
d_inx,d_inteta,d_p1,d_p2 are arrays with all data received.
pos = 48 and raios = 18.
Each block should do the job for only one d_inx element, and each threadIdx.y layer should calculate only one d_inteta element.
Im trying to make this code work for days. Looks like cudaMemcpy isnt copying any data from device to host.
Can someone please help me find what is wrong?
PS: Sorry for my bad english.