ok sorry about the link, i got some server problems lat night. I will include a very simple example, which ist still slower on the gpu tha cpu. I read the existing docu, but well time to read it again.
Problem:
multiply add, substract, divide two arrays of legth n.
and here is the code.
- test file
http://binbase.fiehnlab.ucdavis.edu:8060/b…ay/arrayTest.cu
#include "arrayCuda.h"
#include "array.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
double timeCpp(int count){
//initialize timer
clock_t start = clock();
//calculate needed time
clock_t end = clock();
float* a = (float*) malloc(sizeof(float)*count);
float* b = (float*) malloc(sizeof(float)*count);
float* c = (float*) malloc(sizeof(float)*count);
for(int i = 0; i < count; i++){
a[i] = i;
b[i] = i;
}
//execute the function
add(a,b,c,count);
free(a);
free(b);
free©;
return ((double)end - start) / CLOCKS_PER_SEC;
}
//time the cuda execution time
double timeCuda(int count){
//initialize timer
clock_t start = clock();
//calculate needed time
clock_t end = clock();
float* a = (float*) malloc(sizeof(float)*count);
float* b = (float*) malloc(sizeof(float)*count);
float* c = (float*) malloc(sizeof(float)*count);
for(int i = 0; i < count; i++){
a[i] = i;
b[i] = i;
}
//execute the function
addCuda(a,b,c,count);
free(a);
free(b);
free©;
return ((double)end - start) / CLOCKS_PER_SEC;
}
// main routine that executes on the host
int main(void) {
int size = 1024 * 1024 * 128/sizeof(float);
printf("count\t\tcuda\t\tc++\n");
printf("%i\t%f\t%f\n",size, timeCuda(size),timeCpp(size));
//return
return EXIT_SUCCESS;
}
- cpp/ version
http://binbase.fiehnlab.ucdavis.edu:8060/b…2B%2B/array.cpp
#include "array.h"
/**
* adds two arrays
*/
void add(float* a, float* b, float* result, int arraySize){
for(int i = 0; i < arraySize; i++){
result[i] = a[i] + b[i];
}
}
- cuda version
http://binbase.fiehnlab.ucdavis.edu:8060/b…da/arrayCuda.cu
#include "arrayCuda.h"
__global__ void add(float *result, float *a, float *b, int size) {
//calculate the index of the current spectra
int idx = blockIdx.x * blockDim.x + threadIdx.x;
//assign the result for the given calculation
if (idx<size) {
result[idx] = a[idx] + b[idx];
}
}
/**
* adds two arrays
*/
void addCuda(float* a, float* b, float* result, int arraySize) {
//size of needed arrays
const int floatSize = arraySize * sizeof(float);
//device variables
float* deviceResult;
float* deviceA;
float* deviceB;
//assign memory on the device
cudaMalloc((void **) &deviceResult, floatSize);
cudaMalloc((void **) &deviceA, floatSize);
cudaMalloc((void **) &deviceB, floatSize);
//copy to the device
cudaMemcpy(deviceA, a, floatSize,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, b, floatSize,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceResult, result, floatSize,
cudaMemcpyHostToDevice);
//calculate the block sizes
int block_size = 256;
int n_blocks = arraySize/block_size + (arraySize%block_size
== 0 ? 0 : 1);
//do the operation
add<<< n_blocks, block_size >>>(deviceResult,deviceA,deviceB,arraySize);
//copy the result back into the memory
cudaMemcpy(result, deviceResult, floatSize,cudaMemcpyDeviceToHost);
//free up the memory
cudaFree(deviceResult);
cudaFree(deviceA);
cudaFree(deviceB);
return;
}
so I know its a very simple procedure, but do I miss something somewhere?