diff --git a/Cours/APM_Cours2.pdf b/Cours/APM_Cours2.pdf
index c118429dc01c63ebb0dfe621ae3cf8eb1f060447..ff476756eed647ff381ab38f753a8bba80a19560 100644
Binary files a/Cours/APM_Cours2.pdf and b/Cours/APM_Cours2.pdf differ
diff --git a/TDs/TD1/CORRECTION/Partie2/td1_p2.c b/TDs/TD1/CORRECTION/Partie2/td1_p2.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c935b1b9407042fa0d9f083a5bc696447728392
--- /dev/null
+++ b/TDs/TD1/CORRECTION/Partie2/td1_p2.c
@@ -0,0 +1,86 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+
+void kernel(double *a, double *b, double *c, int N, int dimBlock, int blockId, int threadId)
+{
+  // Global index of the emulated thread
+  int i = blockId * dimBlock + threadId;
+
+  if (i < N)
+  {
+    c[i] = a[i] + b[i];
+  }
+}
+
+int main(int argc, char **argv)
+{
+    int N = 1000;
+    int sz_in_bytes = N*sizeof(double);
+
+    double *h_a, *h_b, *h_c;
+    double *d_a, *d_b, *d_c;
+
+    h_a = (double*)malloc(sz_in_bytes);
+    h_b = (double*)malloc(sz_in_bytes);
+    h_c = (double*)malloc(sz_in_bytes);
+
+    // Initialize the values of h_a and h_b
+    for(int i = 0 ; i < N ; i++)
+    {
+        h_a[i] = 1./(1.+i);
+        h_b[i] = (i-1.)/(i+1.);
+    }
+
+    // Allocation of the three arrays on the device (emulated here with host malloc)
+//    cudaMalloc((void**)&d_a, sz_in_bytes);
+//    cudaMalloc((void**)&d_b, sz_in_bytes);
+//    cudaMalloc((void**)&d_c, sz_in_bytes);
+    d_a = (double*)malloc(sz_in_bytes);
+    d_b = (double*)malloc(sz_in_bytes);
+    d_c = (double*)malloc(sz_in_bytes);
+
+
+    // Copy to the device the values pointed to on the host by h_a and h_b
+    // (on the device they are pointed to by d_a and d_b)
+
+//    cudaMemcpy(d_a, h_a, sz_in_bytes, cudaMemcpyHostToDevice);
+//    cudaMemcpy(d_b, h_b, sz_in_bytes, cudaMemcpyHostToDevice);
+    memcpy(d_a, h_a, sz_in_bytes);
+    memcpy(d_b, h_b, sz_in_bytes);
+
+    //dim3  dimBlock(64, 1, 1);
+    //dim3  dimGrid((N + dimBlock.x - 1)/dimBlock.x, 1, 1);
+    int dimBlock = 64;
+    int dimGrid = (N + dimBlock - 1) / dimBlock;
+
+    // Emulate the CUDA execution model: iterate over blocks, then over threads in each block
+    for(int blockId = 0; blockId < dimGrid; ++blockId)
+    {
+        for(int tid = 0; tid < dimBlock; ++tid)
+        {
+            kernel(d_a, d_b, d_c, N, dimBlock, blockId, tid);
+        }
+    }
+
+    // The result is pointed to by d_c on the device
+    // Copy it back to the host, where it is pointed to by h_c
+//    cudaMemcpy(h_c, d_c, sz_in_bytes, cudaMemcpyDeviceToHost);
+    memcpy(h_c, d_c, sz_in_bytes);
+
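+    // Optional sanity check (not in the original): print a checksum so the result of c = a + b is observable
+    double sum = 0.;
+    for(int i = 0; i < N; i++) sum += h_c[i];
+    printf("checksum(c) = %f\n", sum);
+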
+    // freeing on device 
+//    cudaFree(d_a);
+//    cudaFree(d_b);
+//    cudaFree(d_c);
+    free(d_a);
+    free(d_b);
+    free(d_c);
+
+    free(h_a);
+    free(h_b);
+    free(h_c);
+
+    return 0;
+}
diff --git a/TDs/TD1/CORRECTION/Partie2/td2.c b/TDs/TD1/CORRECTION/Partie2/td2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e331b0a3953c0d48b4306d14c782c93ae101dfe0
--- /dev/null
+++ b/TDs/TD1/CORRECTION/Partie2/td2.c
@@ -0,0 +1,86 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+
+void kernel(double *a, double *b, double *c, int N, int dimGrid, int dimBlock)
+{
+  int i, j, k;
+
+  // Emulate the CUDA execution model: loop over blocks (j) and threads (k)
+  for(j = 0; j < dimGrid; j++)
+  {
+    for(k = 0; k < dimBlock; k++)
+    {
+      i = j * dimBlock + k;
+      if (i < N)
+      {
+        c[i] = a[i] + b[i];
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv)
+{
+    int N = 1000;
+    int sz_in_bytes = N*sizeof(double);
+
+    double *h_a, *h_b, *h_c;
+    double *d_a, *d_b, *d_c;
+
+    h_a = (double*)malloc(sz_in_bytes);
+    h_b = (double*)malloc(sz_in_bytes);
+    h_c = (double*)malloc(sz_in_bytes);
+
+    // Initialize the values of h_a and h_b
+    for(int i = 0 ; i < N ; i++)
+    {
+        h_a[i] = 1./(1.+i);
+        h_b[i] = (i-1.)/(i+1.);
+    }
+
+    // Allocation of the three arrays on the device (emulated here with host malloc)
+//    cudaMalloc((void**)&d_a, sz_in_bytes);
+//    cudaMalloc((void**)&d_b, sz_in_bytes);
+//    cudaMalloc((void**)&d_c, sz_in_bytes);
+    d_a = (double*)malloc(sz_in_bytes);
+    d_b = (double*)malloc(sz_in_bytes);
+    d_c = (double*)malloc(sz_in_bytes);
+
+
+    // Copy to the device the values pointed to on the host by h_a and h_b
+    // (on the device they are pointed to by d_a and d_b)
+
+//    cudaMemcpy(d_a, h_a, sz_in_bytes, cudaMemcpyHostToDevice);
+//    cudaMemcpy(d_b, h_b, sz_in_bytes, cudaMemcpyHostToDevice);
+    memcpy(d_a, h_a, sz_in_bytes);
+    memcpy(d_b, h_b, sz_in_bytes);
+
+    //dim3  dimBlock(64, 1, 1);
+    //dim3  dimGrid((N + dimBlock.x - 1)/dimBlock.x, 1, 1);
+    int dimBlock = 64;
+    int dimGrid = (N + dimBlock - 1) / dimBlock;
+
+    kernel(d_a, d_b, d_c, N, dimGrid, dimBlock);
+
+    // The result is pointed to by d_c on the device
+    // Copy it back to the host, where it is pointed to by h_c
+//    cudaMemcpy(h_c, d_c, sz_in_bytes, cudaMemcpyDeviceToHost);
+    memcpy(h_c, d_c, sz_in_bytes);
+
+    // freeing on device 
+//    cudaFree(d_a);
+//    cudaFree(d_b);
+//    cudaFree(d_c);
+    free(d_a);
+    free(d_b);
+    free(d_c);
+
+
+    free(h_a);
+    free(h_b);
+    free(h_c);
+
+    return 0;
+}
diff --git a/TDs/TD1/SUJET/td1.pdf b/TDs/TD1/SUJET/td1.pdf
index 2b625b1e169022ff40e61423f10e8c6349033f7c..c409c6b0c0cc64db65c2a1de90c43fe94f2f4ceb 100644
Binary files a/TDs/TD1/SUJET/td1.pdf and b/TDs/TD1/SUJET/td1.pdf differ
diff --git a/TDs/TD1/SUJET/td1_slide.pdf b/TDs/TD1/SUJET/td1_slide.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..a963410bce4d5231fb29f16344b9d865a35511d0
Binary files /dev/null and b/TDs/TD1/SUJET/td1_slide.pdf differ
diff --git a/TDs/TD2/CORRECTION/Makefile b/TDs/TD2/CORRECTION/Makefile
index b51970c246b010e4eff4e06c30f78f7212bc8cc7..2e07ac4f537854305082a4a968a760f15963edce 100755
--- a/TDs/TD2/CORRECTION/Makefile
+++ b/TDs/TD2/CORRECTION/Makefile
@@ -5,14 +5,13 @@ CFLAGS=-O3 -Wall
 EXE1=pi.exe
 EXE2=pi_omp.exe
 # EXE3=pi_task.exe
-#EXE3=pi_cuda.exe
-#EXE4=pi_cuda_shared.exe
-#EXE5=pi_cuda_shared_2.exe
-#EXE6=pi_cuda_shared_3.exe
-#EXE7=pi_cuda_shared_4.exe
+EXE3=pi_cuda.exe
+EXE4=pi_cuda_shared.exe
+EXE5=pi_cuda_shared_2.exe
+EXE6=pi_cuda_shared_3.exe
+EXE7=pi_cuda_shared_4.exe
 
-all : $(EXE1) $(EXE2) 
-#$(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7) 
+all : $(EXE1) $(EXE2) $(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7) 
 
 $(EXE1) : pi_sequentiel.o
 	$(CC) $(CFLAGS) -o $@ $<
@@ -20,9 +19,6 @@ $(EXE1) : pi_sequentiel.o
 $(EXE2) : pi_omp.o
 	$(CC) $(CFLAGS_OMP) -o $@ $<
 
-# $(EXE3) : pi_task.o
-#   $(CC) $(CFLAGS_OMP) -o $@ $<
-
 $(EXE3) : pi_cuda.cu
 	$(CUDA_CC) -O3 -o $@ $<
 
diff --git a/TDs/TD2/CORRECTION/pi_cuda.cu b/TDs/TD2/CORRECTION/pi_cuda.cu
new file mode 100755
index 0000000000000000000000000000000000000000..7762ce1806913235615124be84680dfd6720b2f5
--- /dev/null
+++ b/TDs/TD2/CORRECTION/pi_cuda.cu
@@ -0,0 +1,84 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+/** Returns the elapsed time in seconds */
+double get_elapsedtime(void)
+{
+  struct timespec st;
+  int err = gettime(&st);
+  if (err !=0) return 0;
+  return (double)st.tv_sec + get_sub_seconde(st);
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* FIN QUESTION 3*/
+
+/* QUESTION 6 */
+__global__ void gpu_monte_carlo(float *estimate, curandState *states) {
+  unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
+  int points_in_circle = 0;
+  float x = 0., y = 0.;
+
+  curand_init(2020, tid, 0, &states[tid]);  //   Initialize CURAND
+
+  for(int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    x = curand_uniform (&states[tid]);
+    y = curand_uniform (&states[tid]);
+    points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle.
+  }
+  estimate[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi
+}
+/* FIN QUESTION 6 */
+
+int main (int argc, char *argv[]) {
+  float h_counts[BLOCKS * THREADS] = { 0 };
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts;
+  cudaMalloc((void **) &d_counts, BLOCKS * THREADS * sizeof(float)); // allocate device mem. for counts
+/* FIN QUESTION 4 */
+
+/* QUESTION 6 */
+  curandState *d_states;
+  cudaMalloc( (void **)&d_states, THREADS * BLOCKS * sizeof(curandState) );
+/* FIN QUESTION 6 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts, d_states);
+/* FIN QUESTION 3*/
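+
+  // Optional addition: report kernel launch errors; without any check a failed
+  // launch would silently produce a wrong estimate.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) fprintf(stderr, "kernel launch error: %s\n", cudaGetErrorString(err));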
+
+/* QUESTION 5 */
+  cudaMemcpy(h_counts, d_counts, BLOCKS * THREADS * sizeof(float), cudaMemcpyDeviceToHost); // return results 
+/* FIN QUESTION 5 */
+
+  float pi_gpu = 0.f;
+  for(int i = 0; i < BLOCKS * THREADS; i++)
+  {
+    pi_gpu += h_counts[i];
+  }
+
+  pi_gpu /= (BLOCKS * THREADS);
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
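+
+  // Optional cleanup: release the device buffers allocated with cudaMalloc.
+  cudaFree(d_counts);
+  cudaFree(d_states);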
+
+  return 0;
+}
diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared.cu b/TDs/TD2/CORRECTION/pi_cuda_shared.cu
new file mode 100755
index 0000000000000000000000000000000000000000..53212f22f7af28aa646b7564efef5b26b37ea822
--- /dev/null
+++ b/TDs/TD2/CORRECTION/pi_cuda_shared.cu
@@ -0,0 +1,92 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+/** Returns the elapsed time in seconds */
+double get_elapsedtime(void)
+{
+  struct timespec st;
+  int err = gettime(&st);
+  if (err !=0) return 0;
+  return (double)st.tv_sec + get_sub_seconde(st);
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* FIN QUESTION 3*/
+
+/* QUESTION 6 */
+__global__ void gpu_monte_carlo(float *estimate) {
+  unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  int points_in_circle = 0;
+  float x = 0., y = 0.;
+  __shared__ float estimate_s[THREADS];
+  __shared__ curandState states_s[THREADS];
+
+  curand_init(2020, gtid, 0, &states_s[tid]);  //   Initialize CURAND
+
+  for(int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    x = curand_uniform (&states_s[tid]);
+    y = curand_uniform (&states_s[tid]);
+    points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle.
+  }
+  estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi
+  __syncthreads();
+
+  for (unsigned int s=1; s < THREADS; s*=2) {
+    if (tid % (2*s) == 0)
+      estimate_s[tid] += estimate_s[tid + s];
+    __syncthreads();
+  }
+
+  if (tid == 0)
+    estimate[blockIdx.x] = estimate_s[0]; 
+}
+/* FIN QUESTION 6 */
+
+int main (int argc, char *argv[]) {
+  float h_counts[BLOCKS] = { 0 };  // one partial sum per block
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts;
+  cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)); // allocate device mem. for counts
+/* FIN QUESTION 4 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
+/* FIN QUESTION 3*/
+
+/* QUESTION 5 */
+  cudaMemcpy(h_counts, d_counts, BLOCKS  * sizeof(float), cudaMemcpyDeviceToHost); // return results 
+/* FIN QUESTION 5 */
+
+  float pi_gpu = 0.f;
+  for(int i = 0; i < BLOCKS; i++)
+  {
+    pi_gpu += h_counts[i];
+  }
+
+  pi_gpu /= BLOCKS * THREADS;
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
+
+  return 0;
+}
diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu b/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu
new file mode 100755
index 0000000000000000000000000000000000000000..6b8a10556e63b09eb366b4e209997d98612ce4a0
--- /dev/null
+++ b/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu
@@ -0,0 +1,93 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+/** Returns the elapsed time in seconds */
+double get_elapsedtime(void)
+{
+  struct timespec st;
+  int err = gettime(&st);
+  if (err !=0) return 0;
+  return (double)st.tv_sec + get_sub_seconde(st);
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* FIN QUESTION 3*/
+
+/* QUESTION 6 */
+__global__ void gpu_monte_carlo(float *estimate) {
+  unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  int points_in_circle = 0;
+  float x = 0., y = 0.;
+  __shared__ float estimate_s[THREADS];
+  __shared__ curandState states_s[THREADS];
+
+  curand_init(2020, gtid, 0, &states_s[tid]);  //   Initialize CURAND
+
+  for(int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    x = curand_uniform (&states_s[tid]);
+    y = curand_uniform (&states_s[tid]);
+    points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle.
+  }
+  estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi
+  __syncthreads();
+
+  for (unsigned int s=1; s < THREADS; s*=2) {
+    int index = 2 * s * tid;
+    if (index < THREADS)
+      estimate_s[index] += estimate_s[index + s];
+    __syncthreads();
+  }
+
+  if (tid == 0)
+    estimate[blockIdx.x] = estimate_s[0]; 
+}
+/* FIN QUESTION 6 */
+
+int main (int argc, char *argv[]) {
+  float h_counts[BLOCKS] = { 0 };  // one partial sum per block
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts;
+  cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)); // allocate device mem. for counts
+/* FIN QUESTION 4 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
+/* FIN QUESTION 3*/
+
+/* QUESTION 5 */
+  cudaMemcpy(h_counts, d_counts, BLOCKS  * sizeof(float), cudaMemcpyDeviceToHost); // return results 
+/* FIN QUESTION 5 */
+
+  float pi_gpu = 0.f;
+  for(int i = 0; i < BLOCKS; i++)
+  {
+    pi_gpu += h_counts[i];
+  }
+
+  pi_gpu /= BLOCKS * THREADS;
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
+
+  return 0;
+}
diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu b/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu
new file mode 100755
index 0000000000000000000000000000000000000000..8213f5af128395c5730800990fc8e5c9a2a2bf55
--- /dev/null
+++ b/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu
@@ -0,0 +1,92 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+/** Returns the elapsed time in seconds */
+double get_elapsedtime(void)
+{
+  struct timespec st;
+  int err = gettime(&st);
+  if (err !=0) return 0;
+  return (double)st.tv_sec + get_sub_seconde(st);
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* FIN QUESTION 3*/
+
+/* QUESTION 6 */
+__global__ void gpu_monte_carlo(float *estimate) {
+  unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  int points_in_circle = 0;
+  float x = 0., y = 0.;
+  __shared__ float estimate_s[THREADS];
+  __shared__ curandState states_s[THREADS];
+
+  curand_init(2020, gtid, 0, &states_s[tid]);  //   Initialize CURAND
+
+  for(int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    x = curand_uniform (&states_s[tid]);
+    y = curand_uniform (&states_s[tid]);
+    points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle.
+  }
+  estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi
+  __syncthreads();
+
+  for (unsigned int s=THREADS/2; s>0; s>>=1){
+    if (tid < s)
+      estimate_s[tid] += estimate_s[tid + s];
+    __syncthreads();
+  }
+
+  if (tid == 0)
+    estimate[blockIdx.x] = estimate_s[0]; 
+}
+/* FIN QUESTION 6 */
+
+int main (int argc, char *argv[]) {
+  float h_counts[BLOCKS] = { 0 };  // one partial sum per block
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts;
+  cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)); // allocate device mem. for counts
+/* FIN QUESTION 4 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
+/* FIN QUESTION 3*/
+
+/* QUESTION 5 */
+  cudaMemcpy(h_counts, d_counts, BLOCKS  * sizeof(float), cudaMemcpyDeviceToHost); // return results 
+/* FIN QUESTION 5 */
+
+  float pi_gpu = 0.f;
+  for(int i = 0; i < BLOCKS; i++)
+  {
+    pi_gpu += h_counts[i];
+  }
+
+  pi_gpu /= BLOCKS * THREADS;
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
+
+  return 0;
+}
diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu b/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu
new file mode 100755
index 0000000000000000000000000000000000000000..3a92945d90f641cd3d890a794fa0057bc69c1594
--- /dev/null
+++ b/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu
@@ -0,0 +1,86 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <math.h>
+#include <time.h>
+#include <curand_kernel.h>
+
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+/** Returns the elapsed time in seconds */
+double get_elapsedtime(void)
+{
+  struct timespec st;
+  int err = gettime(&st);
+  if (err !=0) return 0;
+  return (double)st.tv_sec + get_sub_seconde(st);
+}
+
+/* QUESTION 3 */
+#define TRIALS_PER_THREAD 4096
+#define BLOCKS 512
+#define THREADS 256
+/* FIN QUESTION 3*/
+
+/* QUESTION 6 */
+__global__ void gpu_monte_carlo(float *estimate) {
+  unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  int points_in_circle = 0;
+  float x = 0., y = 0.;
+  __shared__ float estimate_s[THREADS];
+  __shared__ curandState states_s[THREADS];
+
+  curand_init(2020, gtid, 0, &states_s[tid]);  //   Initialize CURAND
+
+  for(int i = 0; i < TRIALS_PER_THREAD; i++)
+  {
+    x = curand_uniform (&states_s[tid]);
+    y = curand_uniform (&states_s[tid]);
+    points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle.
+  }
+  estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi
+  __syncthreads();
+
+  for (unsigned int s=THREADS/2; s > 0; s>>=1) {
+    if (tid < s)
+      estimate_s[tid] += estimate_s[tid + s];
+    __syncthreads();
+  }
+
+  if (tid == 0)
+    atomicAdd(estimate, estimate_s[0]);
+}
+/* FIN QUESTION 6 */
+
+int main (int argc, char *argv[]) {
+  float h_counts = 0;
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);
+
+/* QUESTION 4 */
+  float *d_counts;
+  cudaMalloc((void **) &d_counts, sizeof(float)); // allocate device mem. for the accumulated estimate
+  cudaMemset(d_counts, 0, sizeof(float));         // zero it: the kernel's atomicAdd accumulates into it
+/* FIN QUESTION 4 */
+
+  t0 = get_elapsedtime();
+/* QUESTION 3 */
+  gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
+/* FIN QUESTION 3*/
+
+/* QUESTION 5 */
+  cudaMemcpy(&h_counts, d_counts, sizeof(float), cudaMemcpyDeviceToHost); // return results 
+/* FIN QUESTION 5 */
+
+  float pi_gpu = h_counts / (BLOCKS * THREADS);
+
+  t1 = get_elapsedtime();
+  duration = (t1 - t0);
+
+  printf("GPU pi calculated in %lf s.\n", duration);
+  fprintf(stdout, "Pi ~= %lf\n", pi_gpu);
+
+  return 0;
+}
diff --git a/TDs/TD2/SUJET/td2.pdf b/TDs/TD2/SUJET/td2.pdf
index 1f9480b357c5e3333f44d550ed2e0ef18113b5b1..c38d9f4cb310957d923abd90d94b241127af1175 100644
Binary files a/TDs/TD2/SUJET/td2.pdf and b/TDs/TD2/SUJET/td2.pdf differ
diff --git a/TPs/TP0/CODE/common/helper_cuda.h b/TPs/TP0/CODE/common/helper_cuda.h
index 3dd446dced85de9d10eb9f1597c56099fa358bdb..77ca0bf8a3091a41d462238e6b8cda540b6868c3 100644
--- a/TPs/TP0/CODE/common/helper_cuda.h
+++ b/TPs/TP0/CODE/common/helper_cuda.h
@@ -46,17 +46,16 @@
 #define MIN(a, b) (a < b ? a : b)
 #endif
 
-#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
 #define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+#define checkCudaErrors(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 
-void check(cudaError_t result, char const *const func, const char *const file,
-           int const line)
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=false)
 {
-  if (result)
-  {
-    fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, (int)result, cudaGetErrorName(result), func);
-    exit(EXIT_FAILURE);
-  }
+   if (code != cudaSuccess)
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
 }
 
 inline void __getLastCudaError(const char *errorMessage, const char *file,
@@ -180,4 +179,4 @@ inline const char *_ConvertSMVer2ArchName(int major, int minor)
 
 // end of CUDA Helper Functions
 
-#endif // COMMON_HELPER_CUDA_H_
\ No newline at end of file
+#endif // COMMON_HELPER_CUDA_H_
diff --git a/TPs/TP0/CODE/compute-sanitizer/Makefile b/TPs/TP0/CODE/compute-sanitizer/Makefile
index 2283121ddd828e6e4f8750965ef592866b4acec1..098d87b12f2989aa389f02fddd4ca65df28bd525 100644
--- a/TPs/TP0/CODE/compute-sanitizer/Makefile
+++ b/TPs/TP0/CODE/compute-sanitizer/Makefile
@@ -1,8 +1,8 @@
 CC = nvcc -lineinfo
-INCLUDES = 
+INCLUDES = -I../common 
 
 # the build target executable:
-EXE = memcheck.exe initcheck.exe 
+EXE = memcheck.exe initcheck.exe out_of_bound.exe 
 
 all: $(EXE)
 
diff --git a/TPs/TP0/CODE/compute-sanitizer/memcheck.cu b/TPs/TP0/CODE/compute-sanitizer/memcheck.cu
index ad3fc59b16be296946be73084b4f19bb676c52fe..7cccf6ab69581d4cc18f137c90e460279a6be50e 100644
--- a/TPs/TP0/CODE/compute-sanitizer/memcheck.cu
+++ b/TPs/TP0/CODE/compute-sanitizer/memcheck.cu
@@ -4,7 +4,7 @@
 
 __global__ void saxpy(int n, float a, float *x, float *y) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    y[i] = a * x[i] + y[i];
+    if (i <= n) y[i] = a * x[i] + y[i];
 }
 
 int main(void) {
diff --git a/TPs/TP0/CODE/compute-sanitizer/out_of_bound.cu b/TPs/TP0/CODE/compute-sanitizer/out_of_bound.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7ceaad48461a65759886060934cb5dd4fd5a4672
--- /dev/null
+++ b/TPs/TP0/CODE/compute-sanitizer/out_of_bound.cu
@@ -0,0 +1,13 @@
+#include "helper_cuda.h"
+
+__global__ void k1(char *d) { d[41 + 128] = 0; }
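+// The write above is deliberately out of bounds: only 42 bytes are allocated in main,
+// but index 169 (41 + 128) is written. Running the binary under
+// `compute-sanitizer ./out_of_bound.exe` reports the invalid access.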
+
+int main() {
+    char *d;
+    checkCudaErrors(cudaMalloc(&d, 42));
+
+    k1<<<1, 1>>>(d);
+    checkCudaErrors(cudaGetLastError());
+    checkCudaErrors(cudaDeviceSynchronize());
+
+}
diff --git a/TPs/TP0/CODE/compute-sanitizer/tags b/TPs/TP0/CODE/compute-sanitizer/tags
index f41805ff57b31cfd930077730d066b2145261de1..3ca8122258a249f317cd0d4fe0ea2e13ff376c4e 100644
--- a/TPs/TP0/CODE/compute-sanitizer/tags
+++ b/TPs/TP0/CODE/compute-sanitizer/tags
@@ -4,6 +4,6 @@
 !_TAG_PROGRAM_NAME	Exuberant Ctags	//
 !_TAG_PROGRAM_URL	http://ctags.sourceforge.net	/official site/
 !_TAG_PROGRAM_VERSION	5.9~svn20110310	//
-CC	Makefile	/^CC = nvcc$/;"	m
-EXE	Makefile	/^EXE = memcheck.exe initcheck.exe $/;"	m
-INCLUDES	Makefile	/^INCLUDES = $/;"	m
+CC	Makefile	/^CC = nvcc -lineinfo$/;"	m
+EXE	Makefile	/^EXE = memcheck.exe initcheck.exe out_of_bound.exe $/;"	m
+INCLUDES	Makefile	/^INCLUDES = -I..\/common $/;"	m
diff --git a/TPs/TP0/CODE/device_query/Makefile b/TPs/TP0/CODE/device_query/Makefile
index 73b63d518ef203ed6ec1b19cbec0c553bc630a18..32e68d57b8d6fb95157ba9072e94601681308929 100644
--- a/TPs/TP0/CODE/device_query/Makefile
+++ b/TPs/TP0/CODE/device_query/Makefile
@@ -1,5 +1,8 @@
+#***********************************************************
+# Created by Patricio Bulic, Davor Sluga UL FRI on 15/6/2022
+#***********************************************************
 CC = nvcc
-INCLUDES = -I../common
+INCLUDES = -I.
 
 # the build target executable:
 TARGET = prog
diff --git a/TPs/TP0/CODE/error_checking/exemple1.cu b/TPs/TP0/CODE/error_checking/exemple1.cu
index 58d7a39197d61cebf8e174c86dde4d5c902214e4..963dc57662debaecf5def59991db80d4aa338a63 100755
--- a/TPs/TP0/CODE/error_checking/exemple1.cu
+++ b/TPs/TP0/CODE/error_checking/exemple1.cu
@@ -3,14 +3,19 @@
 #include "helper_cuda.h"
 
 #define THREADS 1024
-//#define THREADS 4096
-#define TAB_SIZE 8192
+//#define THREADS 2048
+#define TAB_SIZE 100000
 
 __global__ void kernel(int *a, int *b, int *c) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     if (tid < TAB_SIZE) c[tid] = a[tid] + b[tid];
 }
 
+__global__ void init(int *a, int value) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < TAB_SIZE) a[tid] = value;
+}
+
 int main(int argc, char **argv)
 {
     int sz_in_bytes = sizeof(int) * TAB_SIZE;
@@ -22,18 +27,23 @@ int main(int argc, char **argv)
     // Allocation on host (malloc)
     h_c = (int *)malloc(sz_in_bytes);
 
+    // Kernel configuration
+    dim3 dimBlock(THREADS, 1, 1);
+    dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1);
+
     // Allocation on device (cudaMalloc)
     checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_c, sz_in_bytes));
 
-    checkCudaErrors(cudaMemset(d_a, 1, sz_in_bytes));
-    checkCudaErrors(cudaMemset(d_b, 2, sz_in_bytes));
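+    // Use an init kernel rather than cudaMemset: cudaMemset sets individual bytes,
+    // so memset-ing the int arrays with 1 would fill each element with 0x01010101.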
-	cudaDeviceSynchronize();
+    init<<<dimGrid, dimBlock>>>(d_a, 1);
+    checkCudaErrors(cudaGetLastError());
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    init<<<dimGrid, dimBlock>>>(d_b, 2);
+    checkCudaErrors(cudaGetLastError());
+    checkCudaErrors(cudaDeviceSynchronize());
 
-    // Kernel configuration
-    dim3 dimBlock(THREADS, 1, 1);
-    dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1);
 
     // Kernel launch
     kernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);
diff --git a/TPs/TP0/CODE/error_checking/exemple1.exe b/TPs/TP0/CODE/error_checking/exemple1.exe
deleted file mode 100755
index 92ff871b850ef8a9c61848b3b09fc28a55715118..0000000000000000000000000000000000000000
Binary files a/TPs/TP0/CODE/error_checking/exemple1.exe and /dev/null differ
diff --git a/TPs/TP0/CODE/error_checking/exemple2.cu b/TPs/TP0/CODE/error_checking/exemple2.cu
index f863dde23bdab204fd7aead195abe78a7876fe9e..ff6ccfb971fe255327eab462d38571a558887671 100755
--- a/TPs/TP0/CODE/error_checking/exemple2.cu
+++ b/TPs/TP0/CODE/error_checking/exemple2.cu
@@ -2,34 +2,43 @@
 #include <stdlib.h>
 #include "helper_cuda.h"
 
-#define THREADS 256
-#define TAB_SIZE 8192
+#define THREADS 64
+#define TAB_SIZE 1000
 
 __global__ void copy(int *a, int *b) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid <= TAB_SIZE) b[tid] = a[tid];
+	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	if (tid < TAB_SIZE) b[tid] = a[tid];
 }
 
+__global__ void init(int *a, int value) {
+	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	if (tid < TAB_SIZE) a[tid] = value;
+}
+
+
 int main(int argc, char **argv)
 {
-    int sz_in_bytes = sizeof(int) * TAB_SIZE;
+	int sz_in_bytes = sizeof(int) * TAB_SIZE;
 
     int *h_b;
     int res = 0;
     int *d_a, *d_b, *d_c;
 
-    // Allocation on host (malloc)
-    h_b = (int *)malloc(sz_in_bytes);
+	// Allocation on host (malloc)
+	h_b = (int *)malloc(sz_in_bytes);
 
-    // Allocation on device (cudaMalloc)
-    checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes));
+	// Allocation on device (cudaMalloc)
+	checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes));
+	// suppose we forget this
+	// checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes));
 
-    checkCudaErrors(cudaMemset(d_a, 1, sz_in_bytes));
+	// Kernel configuration
+	dim3 dimBlock(THREADS, 1, 1);
+	dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1);
 
-    // Kernel configuration
-    dim3 dimBlock(THREADS, 1, 1);
-    dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1);
+	init<<<dimGrid, dimBlock>>>(d_a, 1);
+	checkCudaErrors(cudaGetLastError());
+	checkCudaErrors(cudaDeviceSynchronize());
 
     // Kernel launch
     copy<<<dimGrid, dimBlock>>>(d_a, d_b);
@@ -37,8 +46,8 @@ int main(int argc, char **argv)
 	checkCudaErrors(cudaDeviceSynchronize());
 	getLastCudaError("copy kernel error after sync");
 
-    // Retrieving data from device (cudaMemcpy)
-    checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost));
+	// Retrieving data from device (cudaMemcpy)
+	checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost));
 
 	// checking if cudamalloc is still available
     checkCudaErrors(cudaMalloc((void **)&d_c, sz_in_bytes));
@@ -48,19 +57,23 @@ int main(int argc, char **argv)
     checkCudaErrors(cudaFree(d_a));
     checkCudaErrors(cudaFree(d_b));
 
-    // computing sum of tab element
-    for (int i = 0; i < TAB_SIZE; i++) res += h_b[i];
+	// computing sum of tab element
+	for (int i = 0; i < TAB_SIZE; i++) res += h_b[i];
 
-    // Verifying if
-    if (res == TAB_SIZE) {
-        fprintf(stderr, "TEST PASSED !\n");
-    }
-    else
-    {
-        fprintf(stderr, "TEST FAILED !\n");
-    }
+	// Verifying if
+	if (res == TAB_SIZE) {
+		fprintf(stderr, "TEST PASSED !\n");
+	}
+	else
+	{
+		fprintf(stderr, "TEST FAILED !\n");
+	}
 
-    free(h_b);
+	free(h_b);
 
-    return 0;
+	return 0;
 }
diff --git a/TPs/TP0/CODE/error_checking/exemple2.exe b/TPs/TP0/CODE/error_checking/exemple2.exe
deleted file mode 100755
index ea89892735b6df2a2e60d443515ec9de4c026f5a..0000000000000000000000000000000000000000
Binary files a/TPs/TP0/CODE/error_checking/exemple2.exe and /dev/null differ
diff --git a/TPs/TP0/CODE/error_checking/exemple3.cu b/TPs/TP0/CODE/error_checking/exemple3.cu
index d2a0c2ce1dd4eb0e8f55c29b89402f59146c1ef2..e8cd6a045f54f66412c1d6db9335489da58e666d 100755
--- a/TPs/TP0/CODE/error_checking/exemple3.cu
+++ b/TPs/TP0/CODE/error_checking/exemple3.cu
@@ -2,61 +2,72 @@
 #include <stdlib.h>
 #include "helper_cuda.h"
 
-#define THREADS 256
-#define TAB_SIZE 8192
+#define THREADS 64
+#define TAB_SIZE 1000
 
 __global__ void copy(int *a, int *b) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid < TAB_SIZE) b[tid] = a[tid];
+	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	if (tid < TAB_SIZE) b[tid] = a[tid];
 }
 
+__global__ void init(int *a, int value) {
+	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	if (tid < TAB_SIZE) a[tid] = value;
+}
+
+
 int main(int argc, char **argv)
 {
-    int sz_in_bytes = sizeof(int) * TAB_SIZE;
+	int sz_in_bytes = sizeof(int) * TAB_SIZE;
+
+	int *h_b;
+	int res = 0;
+	int *d_a, *d_b;
 
-    int *h_b;
-    int res = 0;
-    int *d_a, *d_b;
+	// Allocation on host (malloc)
+	h_b = (int *)malloc(sz_in_bytes);
 
-    // Allocation on host (malloc)
-    h_b = (int *)malloc(sz_in_bytes);
+	// Too big allocation on device
+	checkCudaErrors(cudaMalloc((void **)&d_a, 100000000000));
 
-    // Too big allocation on device
-    checkCudaErrors(cudaMalloc((void **)&d_a, 100000000000));
+	// Allocation on device (cudaMalloc)
+	checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes));
+	checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes));
 
-    // Allocation on device (cudaMalloc)
-    checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes));
+	// Kernel configuration
+	dim3 dimBlock(THREADS, 1, 1);
+	dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1);
 
-    checkCudaErrors(cudaMemset(d_a, 1, sz_in_bytes));
+	init<<<dimGrid, dimBlock>>>(d_a, 1);
 
-    // Kernel configuration
-    dim3 dimBlock(THREADS, 1, 1);
-    dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1);
+	checkCudaErrors(cudaGetLastError());
+	checkCudaErrors(cudaDeviceSynchronize());
 
-    // Kernel launch
-    copy<<<dimGrid, dimBlock>>>(d_a, d_b);
+	// Kernel launch
+	copy<<<dimGrid, dimBlock>>>(d_a, d_b);
+	checkCudaErrors(cudaGetLastError());
+	checkCudaErrors(cudaDeviceSynchronize());
 
-    // Retrieving data from device (cudaMemcpy)
-    checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost));
+	// Retrieving data from device (cudaMemcpy)
+	checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost));
 
-    // Freeing on device (cudaFree)
-    checkCudaErrors(cudaFree(d_a));
-    checkCudaErrors(cudaFree(d_b));
+	// Freeing on device (cudaFree)
+	checkCudaErrors(cudaFree(d_a));
+	checkCudaErrors(cudaFree(d_b));
 
-    // computing sum of tab element
-    for (int i = 0; i < TAB_SIZE; i++) res += h_b[i];
+	// computing sum of tab element
+	for (int i = 0; i < TAB_SIZE; i++) res += h_b[i];
 
-    // Verifying if
-    if (res == TAB_SIZE) {
-        fprintf(stderr, "TEST PASSED !\n");
-    }
-    else
-    {
-        fprintf(stderr, "TEST FAILED !\n");
-    }
+	// Verifying if
+	if (res == TAB_SIZE) {
+		fprintf(stderr, "TEST PASSED !\n");
+	}
+	else
+	{
+		fprintf(stderr, "TEST FAILED !\n");
+	}
 
-    free(h_b);
+	free(h_b);
 
-    return 0;
+	return 0;
 }
diff --git a/TPs/TP0/CODE/error_checking/exemple3.exe b/TPs/TP0/CODE/error_checking/exemple3.exe
deleted file mode 100755
index 247981307c9bfe2e5aa932c7eeea35b63056f71b..0000000000000000000000000000000000000000
Binary files a/TPs/TP0/CODE/error_checking/exemple3.exe and /dev/null differ
diff --git a/TPs/TP0/CORRECTION/device_query/Makefile b/TPs/TP0/CORRECTION/device_query/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..5264cf81a352e7036ca6fa7565b7736350881936
--- /dev/null
+++ b/TPs/TP0/CORRECTION/device_query/Makefile
@@ -0,0 +1,17 @@
+CC = nvcc
+INCLUDES = -I.
+
+# the build target executable:
+TARGET = prog
+
+all: $(TARGET)
+
+$(TARGET): $(TARGET).o
+	$(CC) $(TARGET).o -o $@ 
+
+$(TARGET).o: $(TARGET).cu
+	$(CC) $(INCLUDES) $(TARGET).cu -c -o $@
+
+clean:
+	$(RM) $(TARGET) $(TARGET).o
+	
diff --git a/TPs/TP0/CORRECTION/device_query/helper_cuda.h b/TPs/TP0/CORRECTION/device_query/helper_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..3dd446dced85de9d10eb9f1597c56099fa358bdb
--- /dev/null
+++ b/TPs/TP0/CORRECTION/device_query/helper_cuda.h
@@ -0,0 +1,183 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef MAX
+#define MAX(a, b) (a > b ? a : b)
+#endif
+
+#ifndef MIN
+#define MIN(a, b) (a < b ? a : b)
+#endif
+
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+void check(cudaError_t result, char const *const func, const char *const file,
+           int const line)
+{
+  if (result)
+  {
+    fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, (int)result, cudaGetErrorName(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line)
+{
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err)
+  {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, (int)(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Beginning of GPU Architecture definitions
+int _ConvertSMVer2Cores(int major, int minor)
+{
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM
+  typedef struct
+  {
+    int SM; // 0xMm (hexadecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60, 64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70, 64},
+      {0x72, 64},
+      {0x75, 64},
+      {0x80, 64},
+      {0x86, 128},
+      {0x87, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1)
+  {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+    {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char *_ConvertSMVer2ArchName(int major, int minor)
+{
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct
+  {
+    int SM; // 0xMm (hexadecimal notation), M = SM Major version,
+    // and m = SM minor version
+    const char *name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1)
+  {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor))
+    {
+      return nGpuArchNameSM[index].name;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Default to use %s\n",
+      major, minor, nGpuArchNameSM[index - 1].name);
+  return nGpuArchNameSM[index - 1].name;
+}
+// end of GPU Architecture definitions
+
+// end of CUDA Helper Functions
+
+#endif // COMMON_HELPER_CUDA_H_
\ No newline at end of file
diff --git a/TPs/TP0/CORRECTION/device_query/prog.cu b/TPs/TP0/CORRECTION/device_query/prog.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9700ec2d363038a2667245aec5f9706ded410b48
--- /dev/null
+++ b/TPs/TP0/CORRECTION/device_query/prog.cu
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+
+#include "helper_cuda.h"
+
+int main(int argc, char **argv) {
+  
+  // Get number of GPUs
+  int deviceCount = 0;
+  cudaError_t error = cudaGetDeviceCount(&deviceCount);
+
+  if (error != cudaSuccess) {
+    printf("cudaGetDeviceCount error %d\n-> %s\n", error, cudaGetErrorString(error));
+    exit(EXIT_FAILURE);
+  }
+
+  // Get device properties and print them
+  for (int dev = 0; dev < deviceCount; dev++) {
+    struct cudaDeviceProp prop;
+    int value;
+    printf("\n==========  cudaDeviceGetProperties ============  \n");
+    cudaGetDeviceProperties(&prop, dev);
+    printf("\nDevice %d: \"%s\"\n", dev, prop.name);
+    printf("  GPU Clock Rate (MHz):                          %d\n", prop.clockRate/1000);
+    printf("  Memory Clock Rate (MHz):                       %d\n", prop.memoryClockRate/1000);
+    printf("  Memory Bus Width (bits):                       %d\n", prop.memoryBusWidth);
+    printf("  Peak Memory Bandwidth (GB/s):                  %.2f\n",
+           2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
+    printf("  CUDA Cores/MP:                                 %d\n", _ConvertSMVer2Cores(prop.major, prop.minor));
+    printf("  CUDA Cores:                                    %d\n", _ConvertSMVer2Cores(prop.major, prop.minor) *
+           prop.multiProcessorCount);
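+    // Peak TFLOPS = cores/SM * SM count * 2 (FMA counts as 2 flops) * clock (kHz -> Hz), scaled by 1e12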
+    printf("  Peak TFLOPS:                                    %.1f\n", 
+            (_ConvertSMVer2Cores(prop.major, prop.minor) *
+           2.0 * prop.multiProcessorCount * prop.clockRate * 1000.0)/ 1.0e12);
+    printf("  Total amount of global memory:                 %.0f GB\n", prop.totalGlobalMem / 1073741824.0f);
+    printf("  Total amount of shared memory per block:       %zu kB\n",
+           prop.sharedMemPerBlock/1024);
+    printf("  Total number of registers available per block: %d\n",
+           prop.regsPerBlock);
+    printf("  Warp size:                                     %d\n",
+           prop.warpSize);
+    printf("  Maximum number of threads per block:           %d\n",
+           prop.maxThreadsPerBlock);
+    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
+           prop.maxThreadsDim[0], prop.maxThreadsDim[1],
+           prop.maxThreadsDim[2]);
+    printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
+           prop.maxGridSize[0], prop.maxGridSize[1],
+           prop.maxGridSize[2]);
+
+    printf("\n\n==========  cudaDeviceGetAttribute ============  \n");
+    printf("\nDevice %d: \"%s\"\n", dev, prop.name);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxThreadsPerBlock, dev);
+    printf("  Max number of threads per block:              %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxBlockDimX, dev);
+    printf("  Max block dimension X:                        %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxBlockDimY, dev);
+    printf("  Max block dimension Y:                        %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxBlockDimZ, dev);
+    printf("  Max block dimension Z:                        %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxGridDimX, dev);
+    printf("  Max grid dimension X:                         %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxGridDimY, dev);
+    printf("  Max grid dimension Y:                         %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxGridDimZ, dev);
+    printf("  Max grid dimension Z:                         %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxSharedMemoryPerBlock, dev);
+    printf("  Max shared memory per block:                  %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrWarpSize, dev);
+    printf("  Warp size:                                    %d\n",
+           value);      
+    cudaDeviceGetAttribute (&value, cudaDevAttrClockRate, dev);
+    printf("  Peak clock frequency in kilohertz:            %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMemoryClockRate, dev);
+    printf("  Peak memory clock frequency in kilohertz:     %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrGlobalMemoryBusWidth, dev);
+    printf("  Global memory bus width in bits:              %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrL2CacheSize, dev);
+    printf("  Size of L2 cache in bytes:                    %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxThreadsPerMultiProcessor, dev);
+    printf("  Maximum resident threads per SM:              %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrComputeCapabilityMajor, dev);
+    printf("  Major compute capability version number:      %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrComputeCapabilityMinor, dev);
+    printf("  Minor compute capability version number:      %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev);
+    printf("  Max shared memory per SM in bytes:            %d\n",
+           value);
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxRegistersPerMultiprocessor, dev);
+    printf("  Max number of 32-bit registers per SM:        %d\n",
+           value);  
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+    printf("  Max per block shared mem size on the device:  %d\n",
+           value);  
+    cudaDeviceGetAttribute (&value, cudaDevAttrMaxBlocksPerMultiprocessor, dev);
+    printf("  Max thread blocks that can reside on a SM:    %d\n",
+           value);  
+  }
+}
diff --git a/TPs/TP0/CORRECTION/device_query/tags b/TPs/TP0/CORRECTION/device_query/tags
new file mode 100644
index 0000000000000000000000000000000000000000..74edb7e8f6999f738763c9a1b17486ab326d5c8b
--- /dev/null
+++ b/TPs/TP0/CORRECTION/device_query/tags
@@ -0,0 +1,18 @@
+!_TAG_FILE_FORMAT	2	/extended format; --format=1 will not append ;" to lines/
+!_TAG_FILE_SORTED	1	/0=unsorted, 1=sorted, 2=foldcase/
+!_TAG_PROGRAM_AUTHOR	Darren Hiebert	/dhiebert@users.sourceforge.net/
+!_TAG_PROGRAM_NAME	Exuberant Ctags	//
+!_TAG_PROGRAM_URL	http://ctags.sourceforge.net	/official site/
+!_TAG_PROGRAM_VERSION	5.9~svn20110310	//
+CC	Makefile	/^CC = nvcc$/;"	m
+COMMON_HELPER_CUDA_H_	helper_cuda.h	32;"	d
+INCLUDES	Makefile	/^INCLUDES = -I.$/;"	m
+MAX	helper_cuda.h	42;"	d
+MIN	helper_cuda.h	46;"	d
+TARGET	Makefile	/^TARGET = prog$/;"	m
+_ConvertSMVer2ArchName	helper_cuda.h	/^inline const char *_ConvertSMVer2ArchName(int major, int minor)$/;"	f
+_ConvertSMVer2Cores	helper_cuda.h	/^int _ConvertSMVer2Cores(int major, int minor)$/;"	f
+__getLastCudaError	helper_cuda.h	/^inline void __getLastCudaError(const char *errorMessage, const char *file,$/;"	f
+check	helper_cuda.h	/^void check(cudaError_t result, char const *const func, const char *const file,$/;"	f
+checkCudaErrors	helper_cuda.h	49;"	d
+getLastCudaError	helper_cuda.h	50;"	d
diff --git a/TPs/TP0/CORRECTION/error_checking/exemple2.cu b/TPs/TP0/CORRECTION/error_checking/exemple2.cu
new file mode 100755
index 0000000000000000000000000000000000000000..783478f375efe2ff0eaaf295f1d9fe3bf9d1a595
--- /dev/null
+++ b/TPs/TP0/CORRECTION/error_checking/exemple2.cu
@@ -0,0 +1,72 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "helper_cuda.h"
+
+#define THREADS 64
+#define TAB_SIZE 1000
+
+__global__ void copy(int *a, int *b) {
+	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	if (tid < TAB_SIZE) b[tid] = a[tid];
+}
+
+__global__ void init(int *a, int value) {
+	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	if (tid < TAB_SIZE) a[tid] = value;
+}
+
+
+int main(int argc, char **argv)
+{
+	int sz_in_bytes = sizeof(int) * TAB_SIZE;
+
+	int *h_b;
+	int res = 0;
+	int *d_a, *d_b, *d_dummy;
+
+	// Allocation on host (malloc)
+	h_b = (int *)malloc(sz_in_bytes);
+
+	// Allocation on device (cudaMalloc)
+	checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes));
+	// suppose we forget this
+	// checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes));
+
+	// Kernel configuration
+	dim3 dimBlock(THREADS, 1, 1);
+	dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1);
+
+	init<<<dimGrid, dimBlock>>>(d_a, 1);
+
+	checkCudaErrors(cudaGetLastError());
+	checkCudaErrors(cudaDeviceSynchronize());
+
+	// Kernel launch
+	copy<<<dimGrid, dimBlock>>>(d_a, d_b);
+	checkCudaErrors(cudaMalloc((void **)&d_dummy, sz_in_bytes));
+	checkCudaErrors(cudaGetLastError());
+	checkCudaErrors(cudaDeviceSynchronize());
+
+	// Retrieving data from device (cudaMemcpy)
+	checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost));
+
+	// Freeing on device (cudaFree)
+	checkCudaErrors(cudaFree(d_a));
+	checkCudaErrors(cudaFree(d_b));
+
+	// computing sum of tab element
+	for (int i = 0; i < TAB_SIZE; i++) res += h_b[i];
+
+	// Verifying if
+	if (res == TAB_SIZE) {
+		fprintf(stderr, "TEST PASSED !\n");
+	}
+	else
+	{
+		fprintf(stderr, "TEST FAILED !\n");
+	}
+
+	free(h_b);
+
+	return 0;
+}
diff --git a/TPs/TP0/CORRECTION/tags b/TPs/TP0/CORRECTION/tags
new file mode 100644
index 0000000000000000000000000000000000000000..8de213abb5cdc780fe0f2a444355bf977dbb19d1
--- /dev/null
+++ b/TPs/TP0/CORRECTION/tags
@@ -0,0 +1,18 @@
+!_TAG_FILE_FORMAT	2	/extended format; --format=1 will not append ;" to lines/
+!_TAG_FILE_SORTED	1	/0=unsorted, 1=sorted, 2=foldcase/
+!_TAG_PROGRAM_AUTHOR	Darren Hiebert	/dhiebert@users.sourceforge.net/
+!_TAG_PROGRAM_NAME	Exuberant Ctags	//
+!_TAG_PROGRAM_URL	http://ctags.sourceforge.net	/official site/
+!_TAG_PROGRAM_VERSION	5.9~svn20110310	//
+CC	device_query/Makefile	/^CC = nvcc$/;"	m
+COMMON_HELPER_CUDA_H_	device_query/helper_cuda.h	32;"	d
+INCLUDES	device_query/Makefile	/^INCLUDES = -I.$/;"	m
+MAX	device_query/helper_cuda.h	42;"	d
+MIN	device_query/helper_cuda.h	46;"	d
+TARGET	device_query/Makefile	/^TARGET = prog$/;"	m
+_ConvertSMVer2ArchName	device_query/helper_cuda.h	/^inline const char *_ConvertSMVer2ArchName(int major, int minor)$/;"	f
+_ConvertSMVer2Cores	device_query/helper_cuda.h	/^int _ConvertSMVer2Cores(int major, int minor)$/;"	f
+__getLastCudaError	device_query/helper_cuda.h	/^inline void __getLastCudaError(const char *errorMessage, const char *file,$/;"	f
+check	device_query/helper_cuda.h	/^void check(cudaError_t result, char const *const func, const char *const file,$/;"	f
+checkCudaErrors	device_query/helper_cuda.h	49;"	d
+getLastCudaError	device_query/helper_cuda.h	50;"	d
diff --git a/TPs/TP0/SUJET/tp0.pdf b/TPs/TP0/SUJET/tp0.pdf
index 9061738fe9e2dcb8996cd01a87506887027f81d9..b81d4ccc5c85f52bb976921c8d02fc19c164fc82 100644
Binary files a/TPs/TP0/SUJET/tp0.pdf and b/TPs/TP0/SUJET/tp0.pdf differ