Cuda code to run in a few months, but the analysis does not solve the problem.
Currently using the laptop built-in graphics card is Geforce 610m.
INPUT Picture size is 100X100. And that the output picture, Fringe_Pattren size is 200X200. Making the operation a hologram and I want to use the GPU.
I have been trying to create CGH. Using the GPU I want to improve the speed.
However, an error message is generated. [“cudaDeviceSynchronize returned error code 6 after launching addKerel!” testCuda failed! … ~ ~ Press ~ ~ key ~ ~ ~ …]
Four loops in the first place was the C + + source. I studied were made of two loops. However, an error occurs.
How to use Thread and block? I’m asking for help …
//---------------------------------------------------------------------------------------
Origonal code is
//---------------------------------------------------------------------------------------
#include<stdio.h>
#include<math.h>
#include
#include<boost/thread.hpp>
using namespace std;
using namespace boost;
#include <time.h>
#define height 200
#define width 200
#define Fringe_height 400
#define Fringe_width 400
#define ImageMul 20
#define PI 3.14159265358979323846
#define CGHSCALE 1/255
#define DEFAULTDEPTH 0.5
#define Intensity 5//Light
#define RADIANS PI/180
#define THETA 0.544
#define WaveNum 9926043.4154818337
#define P 0.0000104
int x,y,z,k,l;
float R,DEPTH;
float **Fringe_Pattern, **Image;
FILE *read_file; // file read
FILE *write_file;
void part1()
{
for(k=0; k<Fringe_width; k++){
for(l=0; l<Fringe_height; l++){
for(int i=0; i<width; i++){
for(y=0; y<height; y++){
if(Image[i][y] > 0){
DEPTH = (255-Image[i][y]) * CGHSCALE + DEFAULTDEPTH;
R = sqrt(DEPTHDEPTH + (kP-iPImageMul)(kP-iPImageMul) + (lP-yPImageMul)(lP-yP*ImageMul));
Fringe_Pattern[k][l] += Image[i][y]cos(WaveNumR);
}
}
}
}
}
}
void main()
{
unsigned char tmp;
int temp;
float max=0, min=0;
float max_1=0, min_1=0;
float Resize_Temp = 0;
Image = new float *[width]; // Input_Image
Fringe_Pattern = new float *[Fringe_width];
// Digital_Hologram
for(x=0; x<width; x++){
Image[x] = new float [height];
}
for(x=0; x<Fringe_width; x++){
Fringe_Pattern[x] = new float [Fringe_height];
}
read_file = fopen("image/002.raw", "rb");
for(x=0; x<width; x++){
for(y=0; y<height; y++){
fread(&tmp, 1, 1, read_file);
Image[x][y] = (int)tmp;
}
}
for(k=0; k<Fringe_width; k++){
for(l=0; l<Fringe_height; l++){
Fringe_Pattern[k][l] = 0;
}
}
clock_t start,end;
double duration;
start=clock();
////////////////////////////////////////////////
////////////////////////////////////////////////
thread t_part1(&part1);
t_part1.join();
////////////////////////////////////////////////
////////////////////////////////////////////////
end=clock();
duration = ((double)(end - start)/CLOCKS_PER_SEC );
cout << "Time "<<duration<<endl;
for(x=0; x<Fringe_width; x++){
for(y=0; y<Fringe_height; y++){
if (Fringe_Pattern[x][y] > max){
max = Fringe_Pattern[x][y];
}
if (Fringe_Pattern[x][y] < min){
min = Fringe_Pattern[x][y];
}
}
}
for(x=0; x<Fringe_width; x++){
for(y=0; y<Fringe_height; y++){
Fringe_Pattern[x][y] = ((Fringe_Pattern[x][y]-min)/(max-min))*255;
}
}
write_file = fopen("result/rabbit_imageMul20.raw", "wb"); //Write File
for(x=0; x<Fringe_width; x++){
for(y=0; y<Fringe_height; y++){
temp = Fringe_Pattern[x][y];
fwrite(&temp, 1, 1, write_file);
}
}
}
//-----------------------------------------------------------------------------------------------
Changed cuda code is
//-----------------------------------------------------------------------------------------------
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>
#define height 200
#define width 200
#define Fringe_height 100
#define Fringe_width 100
#define ImageMul 20
#define CGHSCALE 1/255
#define DEFAULTDEPTH 0.5
#define WaveNum 9926043.4154818337
#define P 0.0000104
cudaError_t testCuda(float *Image, float *Fringe_Pattern);
global void testKernel(float *dev_Image, float *dev_Fringe_Pattern)
{
int a, b, i, j, x, y;
float DEPTH, R;
for(a=0; a<Fringe_height*Fringe_width; a++)
{
i=a/Fringe_width;
j=a%Fringe_width;
for(b=0; b<height*width; b++)
{
x=b/width;
y=b%width;
if(dev_Image[x*width+y] > 0)
{
DEPTH = (255-dev_Image[x][y]) * CGHSCALE + DEFAULTDEPTH;
R = sqrt(DEPTH*DEPTH + (i*P-x*P*ImageMul)*(i*P-x*P*ImageMul) + (j*P-y*P*ImageMul)*(j*P-y*P*ImageMul));
dev_Fringe_Pattern[i][j] += dev_Image[x][y]*cos(WaveNum*R);
}
}
}
}
int main()
{
unsigned char tmp;
int temp;
int i, j;
float max=0, min=0;
clock_t start, end;
double duration;
FILE *fin, *fout;
float *Fringe_Pattern, *Image;
start=clock();
Image = (float*)calloc(sizeof(float), height*width);
Fringe_Pattern = (float*)calloc(sizeof(float), Fringe_height*Fringe_width);
//fin = fopen("input.raw", "rb");
fin = fopen("resize_rabbit.raw", "rb");
for(i=0; i<height; i++)
{
for(j=0; j<width; j++)
{
fread(&tmp, 1, 1, fin);
Image[i*width+j] = (int)tmp;
}
}
//cudaError_t cudaStatus = testCuda(Image, Fringe_Pattern);
//if (cudaStatus != cudaSuccess) {
// fprintf(stderr, "testCuda failed!");
// return 1;
//}
for(i=0; i<Fringe_height; i++)
{
for(j=0; j<Fringe_width; j++)
{
if(Fringe_Pattern[i*Fringe_width+j] > max)
max = Fringe_Pattern[i*Fringe_width+j];
if(Fringe_Pattern[i*Fringe_width+j] < min)
min = Fringe_Pattern[i*Fringe_width+j];
}
}
for(i=0; i<Fringe_height; i++)
{
for(j=0; j<Fringe_width; j++)
Fringe_Pattern[i*Fringe_width+j] = ((Fringe_Pattern[i*Fringe_width+j]-min)/(max-min))*255;
}
fout = fopen("result.raw", "wb");
for(i=0; i<Fringe_height; i++)
{
for(j=0; j<Fringe_width; j++)
{
temp = Fringe_Pattern[i*Fringe_width+j];
fwrite(&temp, 1, 1, fout);
}
}
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
free(Image);
free(Fringe_Pattern);
fclose(fin);
fclose(fout);
end=clock();
duration = ((double)(end - start)/CLOCKS_PER_SEC );
printf("Time : %f\n", duration);
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t testCuda(float *Image, float *Fringe_Pattern)
{
float *dev_Image, *dev_Fringe_Pattern;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_Image, height*width*sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_Fringe_Pattern, Fringe_height*Fringe_width*sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_Image, Image, height*width*sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
testKernel<<<1, 1>>>(dev_Image, dev_Fringe_Pattern);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(Fringe_Pattern, dev_Fringe_Pattern, Fringe_height*Fringe_width*sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_Image);
cudaFree(dev_Fringe_Pattern);
return cudaStatus;
}