diff --git a/TDs/TD2/CODE/pi_cuda.cu b/TDs/TD2/CODE/pi_cuda.cu index b3047865d569b9f024663430c945596453b2c234..a18fbabe4029d9cd5a9ecb8bee97e8f27df432bd 100644 --- a/TDs/TD2/CODE/pi_cuda.cu +++ b/TDs/TD2/CODE/pi_cuda.cu @@ -17,35 +17,23 @@ __global__ void pi_kernel(double *d_pi){ const int tid = blockIdx.x * blockDim.x + threadIdx.x; - double n_test = 10E7; - int grid_size_x = 512; - int block_size_x = 256; - int tab_size = grid_size_x * block_size_x; - int trials_per_thread = (int) n_test / tab_size; - curandState localState; curand_init(0, tid, 0, &localState); - int count; + int count = 0; float x, y; - for(size_t i = 0; i < trials_per_thread; ++i){ + for(size_t i = 0; i < TRIALS_PER_THREAD; ++i){ x = curand_uniform(&localState); y = curand_uniform(&localState); count += x * x + y * y < 1; } - d_pi[tid] = 4. * (double) count / (double) trials_per_thread; + d_pi[tid] = 4. * (double) count / (double) TRIALS_PER_THREAD; } int main(int argc, char** argv) { - double n_test = 10E7; - int grid_size_x = 512; - int block_size_x = 256; - int tab_size = grid_size_x * block_size_x; - int trials_per_thread = n_test / tab_size; - uint64_t i; - double pi = 0.; + double pi; - int sz_in_bytes = sizeof(double) * tab_size; + int sz_in_bytes = sizeof(double) * TAB_SIZE; double *h_pi; double *d_pi; @@ -55,19 +43,18 @@ int main(int argc, char** argv) { checkCudaErrors(cudaMalloc((void **) &d_pi, sz_in_bytes)); checkCudaErrors(cudaMemset(d_pi, 0, sz_in_bytes)); - dim3 dimBlock(block_size_x, 1, 1); - dim3 dimGrid(grid_size_x, 1, 1); + dim3 dimBlock(BLOCK_SIZE_X, 1, 1); + dim3 dimGrid(GRID_SIZE_X, 1, 1); pi_kernel<<<dimGrid, dimBlock>>>(d_pi); cudaDeviceSynchronize(); getLastCudaError("PI kernel failed"); checkCudaErrors(cudaMemcpy(h_pi, d_pi, sz_in_bytes, cudaMemcpyDeviceToHost)); - fprintf(stdout, "Pi ~= %lf\n", h_pi[0]); - - for(i = 0; i < tab_size; ++i) + pi = 0.; + for(i = 0; i < TAB_SIZE; ++i) pi += h_pi[i]; - pi /= (double) tab_size; + pi /= (double) TAB_SIZE; fprintf(stdout, "Pi ~= %lf\n", pi);