Ajout de la correction du TD2

7ae17486 · Mickaël Boichot · abd2dd1f · 7ae17486 · 7ae17486 · 7ae17486
--- a/TDs/TD2/CORRECTION/Makefile
+++ b/TDs/TD2/CORRECTION/Makefile
@@ -5,14 +5,13 @@ CFLAGS=-O3 -Wall
 EXE1=pi.exe
 EXE2=pi_omp.exe
 # EXE3=pi_task.exe
-#EXE3=pi_cuda.exe
-#EXE4=pi_cuda_shared.exe
-#EXE5=pi_cuda_shared_2.exe
-#EXE6=pi_cuda_shared_3.exe
-#EXE7=pi_cuda_shared_4.exe
+EXE3=pi_cuda.exe
+EXE4=pi_cuda_shared.exe
+EXE5=pi_cuda_shared_2.exe
+EXE6=pi_cuda_shared_3.exe
+EXE7=pi_cuda_shared_4.exe

-all : $(EXE1) $(EXE2) 
-#$(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7) 
+all : $(EXE1) $(EXE2) $(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7) 

 $(EXE1) : pi_sequentiel.o
 	$(CC) $(CFLAGS) -o $@ $<
@@ -20,9 +19,6 @@ $(EXE1) : pi_sequentiel.o
 $(EXE2) : pi_omp.o
 	$(CC) $(CFLAGS_OMP) -o $@ $<

-# $(EXE3) : pi_task.o
-#   $(CC) $(CFLAGS_OMP) -o $@ $<
-
 $(EXE3) : pi_cuda.cu
 	$(CUDA_CC) -O3 -o $@ $<


--- a/TDs/TD2/CORRECTION/pi_cuda.cu
+++ b/TDs/TD2/CORRECTION/pi_cuda.cu
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+/* Timing helpers built on the raw monotonic clock. */
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+
+/**
+ * Read CLOCK_MONOTONIC_RAW and return the reading in seconds.
+ * Returns 0 when the clock cannot be read.
+ */
+double get_elapsedtime(void)
+{
+  struct timespec now;
+
+  if (gettime(&now) != 0) {
+    return 0;
+  }
+
+  double whole_seconds = (double)now.tv_sec;
+  double fraction = get_sub_seconde(now);
+  return whole_seconds + fraction;
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* END QUESTION 3 */
+
+/* QUESTION 6 */
+/**
+ * Monte Carlo estimate of pi: every thread draws TRIALS_PER_THREAD points in
+ * the unit square and counts those that fall inside the unit quarter circle.
+ *
+ * estimate: device array with one slot per launched thread; slot `tid`
+ *           receives that thread's own estimate of pi.
+ * states:   device array with one curandState per launched thread; slot `tid`
+ *           holds the generator state after the last draw.
+ *
+ * Both arrays are indexed by the global thread id without a bounds check, so
+ * they must hold at least gridDim.x * blockDim.x elements.
+ */
+__global__ void gpu_monte_carlo(float *estimate, curandState *states) {
+  unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
+  int points_in_circle = 0;
+
+  // Draw from a thread-local copy of the generator state so the loop does not
+  // read-modify-write global memory on every call; the state is stored once
+  // at the end.
+  curandState rng;
+  curand_init(2020, tid, 0, &rng);  // fixed seed, one subsequence per thread
+
+  for (int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    float x = curand_uniform(&rng);
+    float y = curand_uniform(&rng);
+    points_in_circle += (x*x + y*y <= 1.0f); // count the point if it lies in the circle
+  }
+
+  states[tid] = rng;
+  estimate[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // this thread's estimate of pi
+}
+/* END QUESTION 6 */
+
+/* Abort with a readable message when a CUDA runtime call fails. */
+#define CUDA_CHECK(call)                                                  \
+  do {                                                                    \
+    cudaError_t err_ = (call);                                            \
+    if (err_ != cudaSuccess) {                                            \
+      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
+              cudaGetErrorString(err_));                                  \
+      exit(EXIT_FAILURE);                                                 \
+    }                                                                     \
+  } while (0)
+
+/**
+ * Launch the Monte Carlo kernel, average the per-thread estimates on the host
+ * and print the result together with the elapsed wall-clock time.
+ * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
+ */
+int main (int argc, char *argv[]) {
+  const size_t n_estimates = (size_t) BLOCKS * THREADS;
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  // One float per GPU thread (BLOCKS * THREADS of them): keep this buffer on
+  // the heap instead of the stack.
+  float *h_counts = (float *) calloc(n_estimates, sizeof(float));
+  if (h_counts == NULL) {
+    fprintf(stderr, "Host allocation failed.\n");
+    return EXIT_FAILURE;
+  }
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts = NULL;
+  CUDA_CHECK(cudaMalloc((void **) &d_counts, n_estimates * sizeof(float))); // device memory for the estimates
+/* END QUESTION 4 */
+
+/* QUESTION 6 */
+  curandState *d_states = NULL;
+  CUDA_CHECK(cudaMalloc((void **) &d_states, n_estimates * sizeof(curandState)));
+/* END QUESTION 6 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts, d_states);
+  CUDA_CHECK(cudaGetLastError()); // a launch does not return a status itself
+/* END QUESTION 3 */
+
+/* QUESTION 5 */
+  // Blocking copy: also the point where an error raised while the kernel ran is reported.
+  CUDA_CHECK(cudaMemcpy(h_counts, d_counts, n_estimates * sizeof(float), cudaMemcpyDeviceToHost));
+/* END QUESTION 5 */
+
+  // Accumulate in double: the running total grows far beyond the magnitude of
+  // a single estimate, which costs precision in a float accumulator.
+  double sum = 0.;
+  for (size_t i = 0; i < n_estimates; i++)
+  {
+    sum += h_counts[i];
+  }
+
+  float pi_gpu = (float) (sum / (double) n_estimates);
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
+
+  CUDA_CHECK(cudaFree(d_states));
+  CUDA_CHECK(cudaFree(d_counts));
+  free(h_counts);
+
+  return 0;
+}
--- a/TDs/TD2/CORRECTION/pi_cuda_shared.cu
+++ b/TDs/TD2/CORRECTION/pi_cuda_shared.cu
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+/* Timing helpers built on the raw monotonic clock. */
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+
+/**
+ * Read CLOCK_MONOTONIC_RAW and return the reading in seconds.
+ * Returns 0 when the clock cannot be read.
+ */
+double get_elapsedtime(void)
+{
+  struct timespec now;
+
+  if (gettime(&now) != 0) {
+    return 0;
+  }
+
+  double whole_seconds = (double)now.tv_sec;
+  double fraction = get_sub_seconde(now);
+  return whole_seconds + fraction;
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* END QUESTION 3 */
+
+/* QUESTION 6 */
+/**
+ * Monte Carlo estimate of pi with a per-block reduction in shared memory.
+ *
+ * Every thread draws TRIALS_PER_THREAD points and stores its own estimate in
+ * shared memory; the block then adds its THREADS estimates together and
+ * thread 0 writes that sum to estimate[blockIdx.x].
+ *
+ * estimate: device array with at least gridDim.x floats (one sum per block).
+ *
+ * Must be launched with blockDim.x == THREADS: the shared array and the
+ * reduction bounds are sized with THREADS, not blockDim.x.
+ */
+__global__ void gpu_monte_carlo(float *estimate) {
+  unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  int points_in_circle = 0;
+  __shared__ float estimate_s[THREADS];
+
+  // The generator state is private to its thread, so it is kept in a local
+  // variable rather than in a __shared__ array.
+  curandState rng;
+  curand_init(2020, gtid, 0, &rng);  // fixed seed, one subsequence per thread
+
+  for (int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    float x = curand_uniform(&rng);
+    float y = curand_uniform(&rng);
+    points_in_circle += (x*x + y*y <= 1.0f); // count the point if it lies in the circle
+  }
+  estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // this thread's estimate of pi
+  __syncthreads();
+
+  // Pairwise sum with a doubling stride: at stride s, every thread whose id
+  // is a multiple of 2*s adds in the element s slots further.
+  for (unsigned int s = 1; s < THREADS; s *= 2) {
+    if (tid % (2*s) == 0)
+      estimate_s[tid] += estimate_s[tid + s];
+    __syncthreads(); // outside the branch: every thread of the block reaches it
+  }
+
+  if (tid == 0)
+    estimate[blockIdx.x] = estimate_s[0];
+}
+/* END QUESTION 6 */
+
+/* Abort with a readable message when a CUDA runtime call fails. */
+#define CUDA_CHECK(call)                                                  \
+  do {                                                                    \
+    cudaError_t err_ = (call);                                            \
+    if (err_ != cudaSuccess) {                                            \
+      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
+              cudaGetErrorString(err_));                                  \
+      exit(EXIT_FAILURE);                                                 \
+    }                                                                     \
+  } while (0)
+
+/**
+ * Launch the Monte Carlo kernel, add up the per-block sums on the host,
+ * divide by the total number of threads and print the result together with
+ * the elapsed wall-clock time.
+ * Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
+ */
+int main (int argc, char *argv[]) {
+  // Only BLOCKS floats are copied back from the device, so that is all the
+  // host buffer needs to hold.
+  float h_counts[BLOCKS] = { 0 };
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts = NULL;
+  CUDA_CHECK(cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float))); // device memory for the per-block sums
+/* END QUESTION 4 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
+  CUDA_CHECK(cudaGetLastError()); // a launch does not return a status itself
+/* END QUESTION 3 */
+
+/* QUESTION 5 */
+  // Blocking copy: also the point where an error raised while the kernel ran is reported.
+  CUDA_CHECK(cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost));
+/* END QUESTION 5 */
+
+  // Accumulate in double: the running total grows far beyond the magnitude of
+  // a single block sum, which costs precision in a float accumulator.
+  double sum = 0.;
+  for (int i = 0; i < BLOCKS; i++)
+  {
+    sum += h_counts[i];
+  }
+
+  float pi_gpu = (float) (sum / (BLOCKS * THREADS));
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
+
+  CUDA_CHECK(cudaFree(d_counts));
+
+  return 0;
+}
--- a/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu
+++ b/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+/* Timing helpers built on the raw monotonic clock. */
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+
+/**
+ * Read CLOCK_MONOTONIC_RAW and return the reading in seconds.
+ * Returns 0 when the clock cannot be read.
+ */
+double get_elapsedtime(void)
+{
+  struct timespec now;
+
+  if (gettime(&now) != 0) {
+    return 0;
+  }
+
+  double whole_seconds = (double)now.tv_sec;
+  double fraction = get_sub_seconde(now);
+  return whole_seconds + fraction;
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* END QUESTION 3 */
+
+/* QUESTION 6 */
+/**
+ * Monte Carlo estimate of pi with a per-block reduction in shared memory.
+ *
+ * Every thread draws TRIALS_PER_THREAD points and stores its own estimate in
+ * shared memory; the block then adds its THREADS estimates together and
+ * thread 0 writes that sum to estimate[blockIdx.x].
+ *
+ * estimate: device array with at least gridDim.x floats (one sum per block).
+ *
+ * Must be launched with blockDim.x == THREADS: the shared array and the
+ * reduction bounds are sized with THREADS, not blockDim.x.
+ */
+__global__ void gpu_monte_carlo(float *estimate) {
+  unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  int points_in_circle = 0;
+  __shared__ float estimate_s[THREADS];
+
+  // The generator state is private to its thread, so it is kept in a local
+  // variable rather than in a __shared__ array.
+  curandState rng;
+  curand_init(2020, gtid, 0, &rng);  // fixed seed, one subsequence per thread
+
+  for (int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    float x = curand_uniform(&rng);
+    float y = curand_uniform(&rng);
+    points_in_circle += (x*x + y*y <= 1.0f); // count the point if it lies in the circle
+  }
+  estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // this thread's estimate of pi
+  __syncthreads();
+
+  // Pairwise sum with a doubling stride. Thread `tid` owns slot 2*s*tid, so
+  // the threads that do the work at each step are the lowest-numbered ones.
+  // Both the destination and the source must be addressed through `index`:
+  // using `tid` here makes neighbouring threads read slots that other
+  // threads are writing in the same step.
+  for (unsigned int s = 1; s < THREADS; s *= 2) {
+    unsigned int index = 2 * s * tid;
+    if (index < THREADS)
+      estimate_s[index] += estimate_s[index + s];
+    __syncthreads(); // outside the branch: every thread of the block reaches it
+  }
+
+  if (tid == 0)
+    estimate[blockIdx.x] = estimate_s[0];
+}
+/* END QUESTION 6 */
+
+/* Abort with a readable message when a CUDA runtime call fails. */
+#define CUDA_CHECK(call)                                                  \
+  do {                                                                    \
+    cudaError_t err_ = (call);                                            \
+    if (err_ != cudaSuccess) {                                            \
+      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
+              cudaGetErrorString(err_));                                  \
+      exit(EXIT_FAILURE);                                                 \
+    }                                                                     \
+  } while (0)
+
+/**
+ * Launch the Monte Carlo kernel, add up the per-block sums on the host,
+ * divide by the total number of threads and print the result together with
+ * the elapsed wall-clock time.
+ * Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
+ */
+int main (int argc, char *argv[]) {
+  // Only BLOCKS floats are copied back from the device, so that is all the
+  // host buffer needs to hold.
+  float h_counts[BLOCKS] = { 0 };
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts = NULL;
+  CUDA_CHECK(cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float))); // device memory for the per-block sums
+/* END QUESTION 4 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
+  CUDA_CHECK(cudaGetLastError()); // a launch does not return a status itself
+/* END QUESTION 3 */
+
+/* QUESTION 5 */
+  // Blocking copy: also the point where an error raised while the kernel ran is reported.
+  CUDA_CHECK(cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost));
+/* END QUESTION 5 */
+
+  // Accumulate in double: the running total grows far beyond the magnitude of
+  // a single block sum, which costs precision in a float accumulator.
+  double sum = 0.;
+  for (int i = 0; i < BLOCKS; i++)
+  {
+    sum += h_counts[i];
+  }
+
+  float pi_gpu = (float) (sum / (BLOCKS * THREADS));
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
+
+  CUDA_CHECK(cudaFree(d_counts));
+
+  return 0;
+}
--- a/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu
+++ b/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+/* Timing helpers built on the raw monotonic clock. */
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+
+/**
+ * Read CLOCK_MONOTONIC_RAW and return the reading in seconds.
+ * Returns 0 when the clock cannot be read.
+ */
+double get_elapsedtime(void)
+{
+  struct timespec now;
+
+  if (gettime(&now) != 0) {
+    return 0;
+  }
+
+  double whole_seconds = (double)now.tv_sec;
+  double fraction = get_sub_seconde(now);
+  return whole_seconds + fraction;
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* END QUESTION 3 */
+
+/* QUESTION 6 */
+/**
+ * Monte Carlo estimate of pi with a per-block reduction in shared memory.
+ *
+ * Every thread draws TRIALS_PER_THREAD points and stores its own estimate in
+ * shared memory; the block then adds its THREADS estimates together and
+ * thread 0 writes that sum to estimate[blockIdx.x].
+ *
+ * estimate: device array with at least gridDim.x floats (one sum per block).
+ *
+ * Must be launched with blockDim.x == THREADS, and THREADS must be a power
+ * of two: the halving loop below would otherwise skip elements.
+ */
+__global__ void gpu_monte_carlo(float *estimate) {
+  unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  int points_in_circle = 0;
+  __shared__ float estimate_s[THREADS];
+
+  // The generator state is private to its thread, so it is kept in a local
+  // variable rather than in a __shared__ array.
+  curandState rng;
+  curand_init(2020, gtid, 0, &rng);  // fixed seed, one subsequence per thread
+
+  for (int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    float x = curand_uniform(&rng);
+    float y = curand_uniform(&rng);
+    points_in_circle += (x*x + y*y <= 1.0f); // count the point if it lies in the circle
+  }
+  estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // this thread's estimate of pi
+  __syncthreads();
+
+  // Pairwise sum with a halving stride: at stride s, the first s threads
+  // each fold the upper half of the live range into the lower half.
+  for (unsigned int s = THREADS/2; s > 0; s >>= 1) {
+    if (tid < s)
+      estimate_s[tid] += estimate_s[tid + s];
+    __syncthreads(); // outside the branch: every thread of the block reaches it
+  }
+
+  if (tid == 0)
+    estimate[blockIdx.x] = estimate_s[0];
+}
+/* END QUESTION 6 */
+
+/* Abort with a readable message when a CUDA runtime call fails. */
+#define CUDA_CHECK(call)                                                  \
+  do {                                                                    \
+    cudaError_t err_ = (call);                                            \
+    if (err_ != cudaSuccess) {                                            \
+      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
+              cudaGetErrorString(err_));                                  \
+      exit(EXIT_FAILURE);                                                 \
+    }                                                                     \
+  } while (0)
+
+/**
+ * Launch the Monte Carlo kernel, add up the per-block sums on the host,
+ * divide by the total number of threads and print the result together with
+ * the elapsed wall-clock time.
+ * Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
+ */
+int main (int argc, char *argv[]) {
+  // Only BLOCKS floats are copied back from the device, so that is all the
+  // host buffer needs to hold.
+  float h_counts[BLOCKS] = { 0 };
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts = NULL;
+  CUDA_CHECK(cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float))); // device memory for the per-block sums
+/* END QUESTION 4 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
+  CUDA_CHECK(cudaGetLastError()); // a launch does not return a status itself
+/* END QUESTION 3 */
+
+/* QUESTION 5 */
+  // Blocking copy: also the point where an error raised while the kernel ran is reported.
+  CUDA_CHECK(cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost));
+/* END QUESTION 5 */
+
+  // Accumulate in double: the running total grows far beyond the magnitude of
+  // a single block sum, which costs precision in a float accumulator.
+  double sum = 0.;
+  for (int i = 0; i < BLOCKS; i++)
+  {
+    sum += h_counts[i];
+  }
+
+  float pi_gpu = (float) (sum / (BLOCKS * THREADS));
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
+
+  CUDA_CHECK(cudaFree(d_counts));
+
+  return 0;
+}
--- a/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu
+++ b/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+/* Timing helpers built on the raw monotonic clock. */
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+
+/**
+ * Read CLOCK_MONOTONIC_RAW and return the reading in seconds.
+ * Returns 0 when the clock cannot be read.
+ */
+double get_elapsedtime(void)
+{
+  struct timespec now;
+
+  if (gettime(&now) != 0) {
+    return 0;
+  }
+
+  double whole_seconds = (double)now.tv_sec;
+  double fraction = get_sub_seconde(now);
+  return whole_seconds + fraction;
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* END QUESTION 3 */
+
+/* QUESTION 6 */
+/**
+ * Monte Carlo estimate of pi with a per-block reduction in shared memory
+ * followed by one atomic accumulation per block.
+ *
+ * Every thread draws TRIALS_PER_THREAD points and stores its own estimate in
+ * shared memory; the block then adds its THREADS estimates together and
+ * thread 0 atomically adds that sum to *estimate.
+ *
+ * estimate: pointer to a single device float. The kernel only adds to it, so
+ *           it must be set to 0 before the launch.
+ *
+ * Must be launched with blockDim.x == THREADS, and THREADS must be a power
+ * of two: the halving loop below would otherwise skip elements.
+ */
+__global__ void gpu_monte_carlo(float *estimate) {
+  unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  int points_in_circle = 0;
+  __shared__ float estimate_s[THREADS];
+
+  // The generator state is private to its thread, so it is kept in a local
+  // variable rather than in a __shared__ array.
+  curandState rng;
+  curand_init(2020, gtid, 0, &rng);  // fixed seed, one subsequence per thread
+
+  for (int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    float x = curand_uniform(&rng);
+    float y = curand_uniform(&rng);
+    points_in_circle += (x*x + y*y <= 1.0f); // count the point if it lies in the circle
+  }
+  estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // this thread's estimate of pi
+  __syncthreads();
+
+  // Pairwise sum with a halving stride: at stride s, the first s threads
+  // each fold the upper half of the live range into the lower half.
+  for (unsigned int s = THREADS/2; s > 0; s >>= 1) {
+    if (tid < s)
+      estimate_s[tid] += estimate_s[tid + s];
+    __syncthreads(); // outside the branch: every thread of the block reaches it
+  }
+
+  // One atomic per block, so blocks finishing at the same time do not lose updates.
+  if (tid == 0)
+    atomicAdd(estimate, estimate_s[0]);
+}
+/* END QUESTION 6 */
+
+/* Abort with a readable message when a CUDA runtime call fails. */
+#define CUDA_CHECK(call)                                                  \
+  do {                                                                    \
+    cudaError_t err_ = (call);                                            \
+    if (err_ != cudaSuccess) {                                            \
+      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
+              cudaGetErrorString(err_));                                  \
+      exit(EXIT_FAILURE);                                                 \
+    }                                                                     \
+  } while (0)
+
+/**
+ * Launch the Monte Carlo kernel, which accumulates every block's sum into a
+ * single device float, divide that total by the number of threads and print
+ * the result together with the elapsed wall-clock time.
+ * Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
+ */
+int main (int argc, char *argv[]) {
+  float h_counts = 0;
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts = NULL;
+  CUDA_CHECK(cudaMalloc((void **) &d_counts, sizeof(float))); // device memory for the accumulated total
+  // cudaMalloc does not clear the memory it returns, and the kernel only ever
+  // adds to this value: start it at 0 (all-zero bytes are 0.0f).
+  CUDA_CHECK(cudaMemset(d_counts, 0, sizeof(float)));
+/* END QUESTION 4 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
+  CUDA_CHECK(cudaGetLastError()); // a launch does not return a status itself
+/* END QUESTION 3 */
+
+/* QUESTION 5 */
+  // Blocking copy: also the point where an error raised while the kernel ran is reported.
+  CUDA_CHECK(cudaMemcpy(&h_counts, d_counts, sizeof(float), cudaMemcpyDeviceToHost));
+/* END QUESTION 5 */
+
+  float pi_gpu = h_counts / (BLOCKS * THREADS);
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
+
+  CUDA_CHECK(cudaFree(d_counts));
+
+  return 0;
+}