diff --git a/TDs/TD2/CODE/Makefile b/TDs/TD2/CODE/Makefile index e564ae5268bea89dddfd42f283a6abc8993459ba..71214f90b3caa5cded767965b5692600ceee9821 100755 --- a/TDs/TD2/CODE/Makefile +++ b/TDs/TD2/CODE/Makefile @@ -1,10 +1,13 @@ CC=gcc +NVCC=nvcc CFLAGS_OMP=-O3 -Wall -fopenmp -DUSE_OMP CFLAGS=-O3 -Wall +NVFLAGS= EXE1=pi.exe EXE2=pi_omp.exe +EXE3=pi_cuda.exe -all : $(EXE1) $(EXE2) +all : $(EXE1) $(EXE2) $(EXE3) $(EXE1) : pi.o $(CC) $(CFLAGS) -o $@ $< @@ -12,14 +15,20 @@ $(EXE1) : pi.o $(EXE2) : pi_omp.o $(CC) $(CFLAGS_OMP) -o $@ $< +$(EXE3) : pi_cuda.o + $(NVCC) $(NVFLAGS) -o $@ $< + %_omp.o : %_omp.c $(CC) $(CFLAGS_OMP) -c -o $@ $< +%_cuda.o : %_cuda.cu + $(NVCC) $(NVFLAGS) -c -o $@ $< + %.o : %.c $(CC) $(CFLAGS) -c -o $@ $< clean : - rm -f *.o $(EXE1) $(EXE2) + rm -f *.o $(EXE1) $(EXE2) $(EXE3) proper : rm -f *.o diff --git a/TDs/TD2/CODE/helper_cuda.h b/TDs/TD2/CODE/helper_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..3dd446dced85de9d10eb9f1597c56099fa358bdb --- /dev/null +++ b/TDs/TD2/CODE/helper_cuda.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +#ifndef MIN +#define MIN(a, b) (a < b ? 
a : b)
+#endif
+
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+// Abort the process with a diagnostic when a CUDA runtime call failed.
+//   result : status returned by the wrapped call
+//   func   : stringified text of the wrapped expression (from checkCudaErrors)
+//   file   : source file of the call site
+//   line   : source line of the call site
+// Declared inline (like __getLastCudaError below) so that this header can be
+// included from several translation units without multiple-definition errors.
+inline void check(cudaError_t result, char const *const func, const char *const file,
+                  int const line)
+{
+    if (result != cudaSuccess)
+    {
+        fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, (int)result, cudaGetErrorName(result), func);
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Query (and clear) the last CUDA runtime error; if one is pending, print
+// errorMessage together with the call site and terminate the process.
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line)
+{
+    cudaError_t err = cudaGetLastError();
+
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr,
+                "%s(%i) : getLastCudaError() CUDA error :"
+                " %s : (%d) %s.\n",
+                file, line, errorMessage, (int)(err),
+                cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Beginning of GPU Architecture definitions
+// Return the number of cores per SM for compute capability major.minor.
+// Unknown versions print a notice and fall back to the last table entry.
+// Declared inline so the header is safe to include in more than one
+// translation unit.
+inline int _ConvertSMVer2Cores(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine
+    // the # of cores per SM
+    typedef struct
+    {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version,
+        // and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    sSMtoCores nGpuArchCoresPerSM[] = {
+        {0x30, 192},
+        {0x32, 192},
+        {0x35, 192},
+        {0x37, 192},
+        {0x50, 128},
+        {0x52, 128},
+        {0x53, 128},
+        {0x60, 64},
+        {0x61, 128},
+        {0x62, 128},
+        {0x70, 64},
+        {0x72, 64},
+        {0x75, 64},
+        {0x80, 64},
+        {0x86, 128},
+        {0x87, 128},
+        {-1, -1}};
+
+    int index = 0;
+
+    while (nGpuArchCoresPerSM[index].SM != -1)
+    {
+        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+        {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+
+        index++;
+    }
+
+    // Not found: index now points at the {-1, -1} sentinel, so index - 1 is
+    // the last real entry, which is used as the default.
+    printf(
+        "MapSMtoCores for SM %d.%d is undefined."
+        " Default to use %d Cores/SM\n",
+        major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+    return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+// Return the architecture name for compute capability major.minor.
+// Unknown versions print a notice and fall back to the last real table entry.
+inline const char *_ConvertSMVer2ArchName(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine
+    // the GPU Arch name)
+    typedef struct
+    {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version,
+        // and m = SM minor version
+        const char *name;
+    } sSMtoArchName;
+
+    sSMtoArchName nGpuArchNameSM[] = {
+        {0x30, "Kepler"},
+        {0x32, "Kepler"},
+        {0x35, "Kepler"},
+        {0x37, "Kepler"},
+        {0x50, "Maxwell"},
+        {0x52, "Maxwell"},
+        {0x53, "Maxwell"},
+        {0x60, "Pascal"},
+        {0x61, "Pascal"},
+        {0x62, "Pascal"},
+        {0x70, "Volta"},
+        {0x72, "Xavier"},
+        {0x75, "Turing"},
+        {0x80, "Ampere"},
+        {0x86, "Ampere"},
+        {0x87, "Ampere"}, // keeps this table in step with the cores table
+        {-1, "Graphics Device"}};
+
+    int index = 0;
+
+    while (nGpuArchNameSM[index].SM != -1)
+    {
+        if (nGpuArchNameSM[index].SM == ((major << 4) + minor))
+        {
+            return nGpuArchNameSM[index].name;
+        }
+
+        index++;
+    }
+
+    // Not found: index points at the sentinel, so index - 1 is the last real
+    // entry, which is used as the default.
+    printf(
+        "MapSMtoArchName for SM %d.%d is undefined."
+        " Default to use %s\n",
+        major, minor, nGpuArchNameSM[index - 1].name);
+    return nGpuArchNameSM[index - 1].name;
+}
+// end of GPU Architecture definitions
+
+// end of CUDA Helper Functions
+
+#endif // COMMON_HELPER_CUDA_H_
\ No newline at end of file
diff --git a/TDs/TD2/CODE/pi.c b/TDs/TD2/CODE/pi.c
index 4d0d7e23180eae12cca0fe6a2ba5cdcd94e1632f..5d4db5ca7464a5036518bf005de907ba57ad2ea6 100755
--- a/TDs/TD2/CODE/pi.c
+++ b/TDs/TD2/CODE/pi.c
@@ -2,20 +2,29 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <time.h>
 
 int main(int argc, char** argv) {
-  uint64_t n_test = 10E7;
-  uint64_t i;
-  uint64_t count = 0;
-  double x = 0., y = 0.;
-  double pi = 0.;
+    // Monte Carlo estimate of Pi: throw n_test random darts at the unit
+    // square and count how many land inside the quarter circle of radius 1.
+    uint64_t n_test = 10E7;
+    uint64_t i;
+    uint64_t count = 0;
+    double x = 0., y = 0.;
+    double pi = 0.;
 
-  // TODO: inialisation du generateur de nombres pseudo aleatoires.
-  // TODO: tirageS de flechettes
+    // Seed the pseudo-random number generator.
+    srand((unsigned int) time(NULL));
 
-  fprintf(stdout, "%ld of %ld throws are in the circle !\n", count, n_test);
-  // TODO: estimation de Pi
-  fprintf(stdout, "Pi ~= %lf\n", pi);
+    // Throw the darts.
+    for(i = 0; i < n_test; ++i){
+        x = (double) rand() / RAND_MAX;
+        y = (double) rand() / RAND_MAX;
+        count += (x * x + y * y < 1.);
+    }
 
-  return 0;
+    // uint64_t is unsigned: print through unsigned long long / %llu instead
+    // of the signed %ld conversion.
+    fprintf(stdout, "%llu of %llu throws are in the circle !\n",
+            (unsigned long long) count, (unsigned long long) n_test);
+    // Estimate Pi: hit ratio approximates Pi / 4.
+    pi = (count / ((double) n_test)) * 4.;
+    fprintf(stdout, "Pi ~= %lf\n", pi);
+
+    return 0;
 }
diff --git a/TDs/TD2/CODE/pi_cuda.cu b/TDs/TD2/CODE/pi_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b3047865d569b9f024663430c945596453b2c234
--- /dev/null
+++ b/TDs/TD2/CODE/pi_cuda.cu
@@ -0,0 +1,78 @@
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <curand_kernel.h>
+
+#include "helper_cuda.h"
+
+#define N_TEST 10E7
+#define GRID_SIZE_X 512
+#define BLOCK_SIZE_X 256
+#define TAB_SIZE (GRID_SIZE_X * 
BLOCK_SIZE_X)
+#define TRIALS_PER_THREAD (N_TEST / TAB_SIZE)
+
+// Monte Carlo estimate of Pi, one independent estimate per thread.
+//
+// Launch layout: 1D grid of 1D blocks. Each thread draws
+// N_TEST / (gridDim.x * blockDim.x) points with its own cuRAND sequence and
+// writes its estimate to d_pi[global thread id].
+//
+// Preconditions:
+//   - d_pi is a device buffer holding at least gridDim.x * blockDim.x doubles.
+//   - The launch uses at most N_TEST threads (otherwise the per-thread trial
+//     count is zero and the stored value is 0/0).
+__global__ void pi_kernel(double *d_pi){
+    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Derive the per-thread workload from the actual launch configuration
+    // instead of a hard-coded copy of the host-side constants.
+    const unsigned int n_threads = gridDim.x * blockDim.x;
+    const unsigned int trials_per_thread =
+        (unsigned int) ((uint64_t) N_TEST / n_threads);
+
+    // Same seed for every thread, distinct sequence number per thread.
+    curandState localState;
+    curand_init(0, tid, 0, &localState);
+
+    // Must start at zero: it was previously left uninitialised, which
+    // produced garbage estimates.
+    unsigned int count = 0;
+    for(unsigned int i = 0; i < trials_per_thread; ++i){
+        const float x = curand_uniform(&localState);
+        const float y = curand_uniform(&localState);
+        count += (x * x + y * y < 1.0f);
+    }
+    d_pi[tid] = 4. * (double) count / (double) trials_per_thread;
+}
+
+// Host driver: launches pi_kernel on GRID_SIZE_X blocks of BLOCK_SIZE_X
+// threads, copies the per-thread estimates back and prints both the first
+// thread's estimate and the average over all threads.
+int main(int argc, char** argv) {
+    const int grid_size_x = GRID_SIZE_X;
+    const int block_size_x = BLOCK_SIZE_X;
+    const int tab_size = grid_size_x * block_size_x;
+    const size_t sz_in_bytes = sizeof(double) * (size_t) tab_size;
+
+    int i;
+    double pi = 0.;
+
+    double *h_pi;
+    double *d_pi;
+
+    h_pi = (double *) malloc(sz_in_bytes);
+    if (h_pi == NULL) {
+        fprintf(stderr, "Host allocation of %zu bytes failed\n", sz_in_bytes);
+        return EXIT_FAILURE;
+    }
+
+    checkCudaErrors(cudaMalloc((void **) &d_pi, sz_in_bytes));
+    checkCudaErrors(cudaMemset(d_pi, 0, sz_in_bytes));
+
+    dim3 dimBlock(block_size_x, 1, 1);
+    dim3 dimGrid(grid_size_x, 1, 1);
+    pi_kernel<<<dimGrid, dimBlock>>>(d_pi);
+    // Launch-configuration errors are reported by cudaGetLastError();
+    // faults during execution surface at the synchronising call.
+    getLastCudaError("PI kernel failed");
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    checkCudaErrors(cudaMemcpy(h_pi, d_pi, sz_in_bytes, cudaMemcpyDeviceToHost));
+
+    // Estimate of the first thread alone.
+    fprintf(stdout, "Pi ~= %lf\n", h_pi[0]);
+
+    // Average of all per-thread estimates.
+    for(i = 0; i < tab_size; ++i)
+        pi += h_pi[i];
+    pi /= (double) tab_size;
+
+    fprintf(stdout, "Pi ~= %lf\n", pi);
+
+    checkCudaErrors(cudaFree(d_pi));
+    free(h_pi);
+
+    return 0;
+}
diff --git a/TDs/TD2/CODE/pi_cuda.out b/TDs/TD2/CODE/pi_cuda.out
new file mode 100644
index 0000000000000000000000000000000000000000..2a6648b3926c0dd99c95eb455c4258366ff6e5cc
--- /dev/null
+++ b/TDs/TD2/CODE/pi_cuda.out
@@ -0,0 +1,14 @@
+⯠make
+nvcc -c -o pi_cuda.o pi_cuda.cu
+nvcc -o pi_cuda.exe pi_cuda.o
+⯠./pi_cuda.exe
+test
+Pi ~= 3.128609
+Pi ~= 3.141749
+⯠make
+nvcc 
-c -o pi_cuda.o pi_cuda.cu
+nvcc -o pi_cuda.exe pi_cuda.o
+⯠./pi_cuda.exe
+Pi ~= 6109520.860892
+Pi ~= -7959034.788585
+
diff --git a/TDs/TD2/CODE/pi_omp.c b/TDs/TD2/CODE/pi_omp.c
index 39fe0cc8cbf8f430db982d6f5ab4c2d208bd8616..7a8ef808f984ccc41a798d29387fcafa6f1c0b7c 100755
--- a/TDs/TD2/CODE/pi_omp.c
+++ b/TDs/TD2/CODE/pi_omp.c
@@ -7,48 +7,62 @@
 
 #include "omp.h"
 
-#define TRIALS_PER_THREAD 10E10
-
 #define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
 #define get_sub_seconde(t) (1e-9 * (double)t.tv_nsec)
 /** return time in second */
 double get_elapsedtime(void) {
-  struct timespec st;
-  int err = gettime(&st);
-  if (err != 0) return 0;
-  return (double)st.tv_sec + get_sub_seconde(st);
+    // Read the raw monotonic clock; 0 is returned when the clock is
+    // unavailable.
+    struct timespec st;
+    int err = gettime(&st);
+    if (err != 0) return 0;
+    return (double) st.tv_sec + get_sub_seconde(st);
 }
 
 int main(int argc, char** argv) {
-  uint64_t const n_test = TRIALS_PER_THREAD;
-  uint64_t i;
-  double x = 0., y = 0.;
-  double pi = 0.;
-  double t0 = 0., t1 = 0., duration = 0.;
+    // Monte Carlo estimate of Pi: n_test darts are shared between the
+    // OpenMP threads and the per-thread hit counts are summed by reduction.
+    uint64_t const n_test = 10E7;
+    uint64_t i = 0;
+    double x = 0., y = 0.;
+    double pi = 0.;
+    double t0 = 0., t1 = 0., duration = 0.;
+
+    int nb_threads = 0;
+    uint64_t count = 0;
+
+    t0 = get_elapsedtime();
+
+    #pragma omp parallel firstprivate(i, x, y) reduction(+:count)
+    {
+        #pragma omp master
+        {
+            nb_threads = omp_get_num_threads();
+            fprintf(stdout, "Nb threads: %d\n", nb_threads);
+        }
+
+        size_t thread_num = omp_get_thread_num();
 
-  int nb_threads = 0;
-#pragma omp parallel shared(nb_threads)
-#pragma omp master
-  nb_threads = omp_get_num_threads();
-  fprintf(stdout, "Nb threads: %d\n", nb_threads);
+        // Per-thread seed for rand_r: wall-clock time mixed with the
+        // thread number so that threads do not share a random stream.
+        unsigned int seed = time(NULL) ^ thread_num;
 
-  // TODO: initialisation du tableau stockant le résultat de chaque thread
-  uint64_t* result = (uint64_t*)malloc(sizeof(uint64_t) * nb_threads);
+        // Throw the darts.
+        #pragma omp for
+        for(i = 0; i < n_test; ++i){
+            x = (double) rand_r(&seed) / RAND_MAX;
+            y = (double) rand_r(&seed) / RAND_MAX;
+            count += (x * x + y * y < 1.);
+        }
+    }
 
-  // TODO: initialisation du generateur de nombre pseudo aléatoires
+    t1 = get_elapsedtime();
 
-  t0 = get_elapsedtime();
+    duration = (t1 - t0);
 
-  // TODO: tirageS de flechettes
+    // uint64_t is unsigned: print through unsigned long long / %llu instead
+    // of the signed %ld conversion.
+    fprintf(stdout,
+        "%llu of %llu throws are in the circle ! (Time: %lf s)\n",
+        (unsigned long long) count, (unsigned long long) n_test, duration);
 
-  // TODO: reduction des resultats
-  t1 = get_elapsedtime();
-  duration = (t1 - t0);
-  fprintf(stdout, "%ld of %ld throws are in the circle ! (Time: %lf s)\n",
-      (uint64_t)pi, n_test, duration);
-  // TODO: estimation de Pi
-  fprintf(stdout, "Pi ~= %lf\n", pi);
+    // Estimate Pi: hit ratio approximates Pi / 4.
+    pi = (count / ((double) n_test)) * 4.;
+    fprintf(stdout, "Pi ~= %lf\n", pi);
 
-  return 0;
+    return 0;
 }