diff --git a/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu b/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu
index 7e6abed0a36d1032a6a2f178ba302e4f4bfa69f0..3d2ae823aee66c06f3ec31f9ce648510d3f8a097 100755
--- a/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu
+++ b/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu
@@ -33,22 +33,18 @@ int verify_matrix(double *matRef, double *matOut, int N) {
   }
   return 0;
 }
+void init(double* A, double* B, double* C, int size) {
+    int i = 0, j = 0;
 
-void init(double* A, double* B, double* C, int size)
-{
-  int i = 0, j = 0;
-
-	srand(2019);
+    srand(2019);
 
-	for(i = 0; i < size; i++)
-  {
-		for(j = 0; j < size; j++)
-    {
-			A[i * size + j] = rand();
-			B[i * size + j] = rand();
-			C[i * size + j] = 0.0;
-		}
-	}
+    for (i = 0; i < size; i++) {
+        for (j = 0; j < size; j++) {
+            A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            C[i * size + j] = 0.0;
+        }
+    }
 }
 
 void mult(double* A, double* B, double* C, int size)
diff --git a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu
index 459789977f5d39de2aba3e4bbd9990cf23e54a2a..036df166c7152add35715c4dd45e18e11f4d2c8c 100755
--- a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu
+++ b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu
@@ -34,21 +34,18 @@ int verify_matrix(double *matRef, double *matOut, int N) {
   return 0;
 }
 
-void init(double* A, double* B, double* C, int size)
-{
-  int i = 0, j = 0;
+void init(double* A, double* B, double* C, int size) {
+    int i = 0, j = 0;
 
-	srand(2019);
+    srand(2019);
 
-	for(i = 0; i < size; i++)
-  {
-		for(j = 0; j < size; j++)
-    {
-			A[i * size + j] = rand();
-			B[i * size + j] = rand();
-			C[i * size + j] = 0.0;
-		}
-	}
+    for (i = 0; i < size; i++) {
+        for (j = 0; j < size; j++) {
+            A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            C[i * size + j] = 0.0;
+        }
+    }
 }
 
 void mult(double* A, double* B, double* C, int size)
diff --git a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu
index 93a076bd84ef97f8c0313a9ad362fc8adaffdc86..68d35f3ab81bcb3a4cc3a1aef9e17e31fc0102b1 100755
--- a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu
+++ b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu
@@ -33,22 +33,18 @@ int verify_matrix(double *matRef, double *matOut, int N) {
   }
   return 0;
 }
+void init(double* A, double* B, double* C, int size) {
+    int i = 0, j = 0;
 
-void init(double* A, double* B, double* C, int size)
-{
-  int i = 0, j = 0;
-
-	srand(2019);
+    srand(2019);
 
-	for(i = 0; i < size; i++)
-  {
-		for(j = 0; j < size; j++)
-    {
-			A[i * size + j] = rand();
-			B[i * size + j] = rand();
-			C[i * size + j] = 0.0;
-		}
-	}
+    for (i = 0; i < size; i++) {
+        for (j = 0; j < size; j++) {
+            A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            C[i * size + j] = 0.0;
+        }
+    }
 }
 
 void mult(double* A, double* B, double* C, int size)
diff --git a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu
index 05ea42ff7ce0526bee4413902474e6fc8faa8b71..28b03b6924ef63c52a3f03344d88896cb02aef09 100755
--- a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu
+++ b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu
@@ -10,88 +10,85 @@
 #define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
 #define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
 /** return time in second
-*/
+ */
 double get_elapsedtime(void)
 {
-  struct timespec st;
-  int err = gettime(&st);
-  if (err !=0) return 0;
-  return (double)st.tv_sec + get_sub_seconde(st);
+    struct timespec st;
+    int err = gettime(&st);
+    if (err !=0) return 0;
+    return (double)st.tv_sec + get_sub_seconde(st);
 }
 
 int verify_matrix(double *matRef, double *matOut, int N) {
-  double diff = 0.0;
-  int i;
-  for (i = 0; i < N*N; i++) {
-    diff = fabs(matRef[i] - matOut[i]);
-    if (diff > 0.01) {
-      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n",
-          matRef[i], matOut[i], diff, i);
-      return 1;
+    double diff = 0.0;
+    int i;
+    for (i = 0; i < N*N; i++) {
+        diff = fabs(matRef[i] - matOut[i]);
+        if (diff > 0.01) {
+            printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n",
+                    matRef[i], matOut[i], diff, i);
+            return 1;
+        }
     }
-  }
-  return 0;
+    return 0;
 }
 
-void init(double* A, double* B, double* C, int size)
-{
-  int i = 0, j = 0;
+void init(double* A, double* B, double* C, int size) {
+    int i = 0, j = 0;
 
-	srand(2019);
+    srand(2019);
 
-	for(i = 0; i < size; i++)
-  {
-		for(j = 0; j < size; j++)
-    {
-			A[i * size + j] = rand();
-			B[i * size + j] = rand();
-			C[i * size + j] = 0.0;
-		}
-	}
+    for (i = 0; i < size; i++) {
+        for (j = 0; j < size; j++) {
+            A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            C[i * size + j] = 0.0;
+        }
+    }
 }
 
 void mult(double* A, double* B, double* C, int size)
 {
-  int i = 0, j = 0, k = 0;
+    int i = 0, j = 0, k = 0;
 
-	for(i = 0; i < size; i++)
-  {
-		for(j = 0; j < size; j++)
+    for(i = 0; i < size; i++)
     {
-      double sum = 0.;
-			for(k = 0; k < size; k++)
-      {
-			  sum += A[i * size + k] * B[k * size + j];
-			}
-      C[i * size + j] = sum;
-		}
-	}
+        for(j = 0; j < size; j++)
+        {
+            double sum = 0.;
+            for(k = 0; k < size; k++)
+            {
+                sum += A[i * size + k] * B[k * size + j];
+            }
+            C[i * size + j] = sum;
+        }
+    }
 }
 
-__global__
-	void MulMatrixKernel(double* A, double* B, double* C, int N)
-	{
-		int col		= threadIdx.x + blockDim.x * blockIdx.x;
-		int ligne	= threadIdx.y + blockDim.y * blockIdx.y;
+    __global__
+void MulMatrixKernel(double* A, double* B, double* C, int N)
+{
+    int col		= threadIdx.x + blockDim.x * blockIdx.x;
+    int ligne	= threadIdx.y + blockDim.y * blockIdx.y;
 
-		if((col < N) && (ligne < N))
+    if((col < N) && (ligne < N))
     {
-			double val = 0.0f;
-			for(int k = 0; k < N; k++)
-      {
-				val += A[ligne * N + k] * B[k * N + col];
-			}
-			C[ligne * N + col] = val;
-		}
-	}
+        double val = 0.0f;
+        for(int k = 0; k < N; k++)
+        {
+            val += A[ligne * N + k] * B[k * N + col];
+        }
+        C[ligne * N + col] = val;
+    }
+}
 
 __global__
-	void MulMatrixShare(double* A, double* B, double* C, int N){
+void MulMatrixShare(double* A, double* B, double* C, int N){
 
     // tuile en shared memory sous format row major
     // dimensions de la tuile = dimension du block de thread
-		__shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH];
-		__shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH];
+    __shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH];
+    __shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH];
 
     // indices des premières ligne et colonne calculées par le bloc de thread
     int blockRow = BLOCK_WIDTH * blockIdx.y;
@@ -103,122 +100,122 @@ __global__
 
     // indices de la valeur de C calculé par le thread
 
-		double value = 0.0f;
+    double value = 0.0f;
 
-		for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH)
+    for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH)
     {
 
-      A_s[BLOCK_WIDTH * threadRow + threadCol] = A[(blockRow /* première ligne traitée par le bloc de thread */
-                                                  + threadRow) * N /* décalage à la ligne traitée par le thread */ 
-                                                  + tile_offset  /* décalage  à la tuile courante */
-                                                  + threadCol]; /* dcalage à la colonne traitée par le thread */
+        A_s[BLOCK_WIDTH * threadRow + threadCol] = A[(blockRow /* première ligne traitée par le bloc de thread */
+                + threadRow) * N /* décalage à la ligne traitée par le thread */ 
+            + tile_offset  /* décalage à la tuile courante */
+            + threadCol]; /* décalage à la colonne traitée par le thread */
 
-      B_s[BLOCK_WIDTH * threadRow + threadCol] = B[tile_offset * N /* décalage à la ligne tuile courante */
-                                                  + threadRow * N /* décalage à la ligne du thread */
-                                                  + blockCol /* décalage à la colonne du bloc */
-                                                  + threadCol]; /* décalage à la colonne du thread */
+        B_s[BLOCK_WIDTH * threadRow + threadCol] = B[tile_offset * N /* décalage à la ligne tuile courante */
+            + threadRow * N /* décalage à la ligne du thread */
+            + blockCol /* décalage à la colonne du bloc */
+            + threadCol]; /* décalage à la colonne du thread */
 
-      // Attente que tous les threads ont bien chargé dans la mémoire partagée leurs deux indices
-			__syncthreads();
+        // Attente que tous les threads ont bien chargé dans la mémoire partagée leurs deux indices
+        __syncthreads();
 
-			for(int k =0; k < BLOCK_WIDTH; k++)
-      {
-				value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol];
-      }
+        for(int k =0; k < BLOCK_WIDTH; k++)
+        {
+            value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol];
+        }
 
-      // S'assurer que tous les threads ont bien fini le calcul du préliminaire du tile courant avant de commencer la prochaine étape du calcul de cette tile
-			__syncthreads();
-		}
+        // S'assurer que tous les threads ont bien fini le calcul du préliminaire du tile courant avant de commencer la prochaine étape du calcul de cette tile
+        __syncthreads();
+    }
 
     // Enregistrer la valeur accumulée dans C (mémoire globale)
-		C[(blockRow + threadRow) * N  + blockCol + threadCol] = value;
+    C[(blockRow + threadRow) * N  + blockCol + threadCol] = value;
 }
 
 int main(int argc, char** argv)
 {
-	int N, use_cpu;
-
-	N = (argc < 2)?1000:atoi(argv[1]);
-	use_cpu = (argc < 3)?0:atoi(argv[2]);
-
-  double t0 = 0., t1 = 0., duration = 0.;
-
-	int nbBlocks = N / BLOCK_WIDTH;
-	//if(N % BLOCK_WIDTH) nbBlocks++;
-	if(N % BLOCK_WIDTH) N += (N % BLOCK_WIDTH);
-	dim3 gridSize(nbBlocks, nbBlocks);
-	dim3 blockSize(BLOCK_WIDTH * BLOCK_WIDTH);
-
-	double *A, *B, *C, *C_ref;
-	double *d_A, *d_B, *d_C;
-
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-
-	A = (double*) malloc(sizeof(double) * N * N);
-	B = (double*) malloc(sizeof(double) * N * N);
-	C = (double*) malloc(sizeof(double) * N * N);
-  if (use_cpu)
-  	C_ref = (double*) malloc(sizeof(double) * N * N);
-
-	cudaMalloc(&d_A, sizeof(double) * N * N);
-	cudaMalloc(&d_B, sizeof(double) * N * N);
-	cudaMalloc(&d_C, sizeof(double) * N * N);
-
-	// Value initialization
-  init(A, B, C, N);
-
-	cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice);
-	cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice);
-	cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice);
-
-	cudaEventRecord(start);
-	//MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
-	MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
-	cudaEventRecord(stop);
-
-	cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost);
-
-  // Compute multiplication on cpu
-  if (use_cpu){
-    t0 = get_elapsedtime();
-    mult(A, B, C_ref, N);
-    t1 = get_elapsedtime();
-  }
-
-  // get gpu elapsed time
-  cudaEventSynchronize(stop);
-  float gpu_duration = 0;
-  cudaEventElapsedTime(&gpu_duration, start, stop);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-
-  // Pretty print
-  uint64_t N_64 = (uint64_t) N;
-  uint64_t nb_op = 2* N_64 * N_64 * N_64;
-
-  if (use_cpu){
-    duration = (t1 - t0);
-    fprintf(stdout, "Performance results: \n");
-	  fprintf(stdout, "  Time: %lf s\n", duration);
-    fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);
-
-    if (verify_matrix(C, C_ref, N) != 0){
-      fprintf(stderr, "Wrong result !\n");
+    int N, use_cpu;
+
+    N = (argc < 2)?1000:atoi(argv[1]);
+    use_cpu = (argc < 3)?0:atoi(argv[2]);
+
+    double t0 = 0., t1 = 0., duration = 0.;
+
+    int nbBlocks = N / BLOCK_WIDTH;
+    //if(N % BLOCK_WIDTH) nbBlocks++;
+    if(N % BLOCK_WIDTH) N += (N % BLOCK_WIDTH);
+    dim3 gridSize(nbBlocks, nbBlocks);
+    dim3 blockSize(BLOCK_WIDTH * BLOCK_WIDTH);
+
+    double *A, *B, *C, *C_ref;
+    double *d_A, *d_B, *d_C;
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    A = (double*) malloc(sizeof(double) * N * N);
+    B = (double*) malloc(sizeof(double) * N * N);
+    C = (double*) malloc(sizeof(double) * N * N);
+    if (use_cpu)
+        C_ref = (double*) malloc(sizeof(double) * N * N);
+
+    cudaMalloc(&d_A, sizeof(double) * N * N);
+    cudaMalloc(&d_B, sizeof(double) * N * N);
+    cudaMalloc(&d_C, sizeof(double) * N * N);
+
+    // Value initialization
+    init(A, B, C, N);
+
+    cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice);
+
+    cudaEventRecord(start);
+    //MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
+    MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
+    cudaEventRecord(stop);
+
+    cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost);
+
+    // Compute multiplication on cpu
+    if (use_cpu){
+        t0 = get_elapsedtime();
+        mult(A, B, C_ref, N);
+        t1 = get_elapsedtime();
+    }
+
+    // get gpu elapsed time
+    cudaEventSynchronize(stop);
+    float gpu_duration = 0;
+    cudaEventElapsedTime(&gpu_duration, start, stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    // Pretty print
+    uint64_t N_64 = (uint64_t) N;
+    uint64_t nb_op = 2* N_64 * N_64 * N_64;
+
+    if (use_cpu){
+        duration = (t1 - t0);
+        fprintf(stdout, "Performance results: \n");
+        fprintf(stdout, "  Time: %lf s\n", duration);
+        fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);
+
+        if (verify_matrix(C, C_ref, N) != 0){
+            fprintf(stderr, "Wrong result !\n");
+        }
     }
-  }
-  fprintf(stdout, "Performance results (GPU): \n");
-  fprintf(stdout, "  Time: %lf s\n", gpu_duration*1E-3);
-  fprintf(stdout, "  GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6);
+    fprintf(stdout, "Performance results (GPU): \n");
+    fprintf(stdout, "  Time: %lf s\n", gpu_duration*1E-3);
+    fprintf(stdout, "  GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6);
 
-  cudaFree(d_A);
-	cudaFree(d_B);
-	cudaFree(d_C);
+    cudaFree(d_A);
+    cudaFree(d_B);
+    cudaFree(d_C);
 
-	free(A);
-	free(B);
-	free(C);
+    free(A);
+    free(B);
+    free(C);
 
-	return 0;
+    return 0;
 }
diff --git a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu
index 519ae89cc7c58ea69d5fa29e021a3515836525be..bb9f2024484e4a56b5622b7e50c6d38704f157f8 100755
--- a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu
+++ b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu
@@ -10,88 +10,85 @@
 #define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
 #define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
 /** return time in second
-*/
+ */
 double get_elapsedtime(void)
 {
-  struct timespec st;
-  int err = gettime(&st);
-  if (err !=0) return 0;
-  return (double)st.tv_sec + get_sub_seconde(st);
+    struct timespec st;
+    int err = gettime(&st);
+    if (err !=0) return 0;
+    return (double)st.tv_sec + get_sub_seconde(st);
 }
 
 int verify_matrix(double *matRef, double *matOut, int N) {
-  double diff = 0.0;
-  int i;
-  for (i = 0; i < N*N; i++) {
-    diff = fabs(matRef[i] - matOut[i]);
-    if (diff > 0.01) {
-      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n",
-          matRef[i], matOut[i], diff, i);
-      return 1;
+    double diff = 0.0;
+    int i;
+    for (i = 0; i < N*N; i++) {
+        diff = fabs(matRef[i] - matOut[i]);
+        if (diff > 0.01) {
+            printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n",
+                    matRef[i], matOut[i], diff, i);
+            return 1;
+        }
     }
-  }
-  return 0;
+    return 0;
 }
 
-void init(double* A, double* B, double* C, int size)
-{
-  int i = 0, j = 0;
+void init(double* A, double* B, double* C, int size) {
+    int i = 0, j = 0;
 
-	srand(2019);
+    srand(2019);
 
-	for(i = 0; i < size; i++)
-  {
-		for(j = 0; j < size; j++)
-    {
-			A[i * size + j] = rand();
-			B[i * size + j] = rand();
-			C[i * size + j] = 0.0;
-		}
-	}
+    for (i = 0; i < size; i++) {
+        for (j = 0; j < size; j++) {
+            A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5);
+            C[i * size + j] = 0.0;
+        }
+    }
 }
 
 void mult(double* A, double* B, double* C, int size)
 {
-  int i = 0, j = 0, k = 0;
+    int i = 0, j = 0, k = 0;
 
-	for(i = 0; i < size; i++)
-  {
-		for(j = 0; j < size; j++)
+    for(i = 0; i < size; i++)
     {
-      double sum = 0.;
-			for(k = 0; k < size; k++)
-      {
-			  sum += A[i * size + k] * B[k * size + j];
-			}
-      C[i * size + j] = sum;
-		}
-	}
+        for(j = 0; j < size; j++)
+        {
+            double sum = 0.;
+            for(k = 0; k < size; k++)
+            {
+                sum += A[i * size + k] * B[k * size + j];
+            }
+            C[i * size + j] = sum;
+        }
+    }
 }
 
-__global__
-	void MulMatrixKernel(double* A, double* B, double* C, int N)
-	{
-		int col		= threadIdx.x + blockDim.x * blockIdx.x;
-		int ligne	= threadIdx.y + blockDim.y * blockIdx.y;
+    __global__
+void MulMatrixKernel(double* A, double* B, double* C, int N)
+{
+    int col		= threadIdx.x + blockDim.x * blockIdx.x;
+    int ligne	= threadIdx.y + blockDim.y * blockIdx.y;
 
-		if((col < N) && (ligne < N))
+    if((col < N) && (ligne < N))
     {
-			double val = 0.0f;
-			for(int k = 0; k < N; k++)
-      {
-				val += A[ligne * N + k] * B[k * N + col];
-			}
-			C[ligne * N + col] = val;
-		}
-	}
+        double val = 0.0f;
+        for(int k = 0; k < N; k++)
+        {
+            val += A[ligne * N + k] * B[k * N + col];
+        }
+        C[ligne * N + col] = val;
+    }
+}
 
 __global__
-	void MulMatrixShare(double* A, double* B, double* C, int N){
+void MulMatrixShare(double* A, double* B, double* C, int N){
 
     // tuile en shared memory sous format row major
     // dimensions de la tuile = dimension du block de thread
-		__shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH];
-		__shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH];
+    __shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH];
+    __shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH];
 
     // indices des premières ligne et colonne calculées par le bloc de thread
     int blockRow = BLOCK_WIDTH * blockIdx.y;
@@ -103,122 +100,122 @@ __global__
 
     // indices de la valeur de C calculé par le thread
 
-		double value = 0.0f;
+    double value = 0.0f;
 
-		for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH)
+    for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH)
     {
 
-      A_s[BLOCK_WIDTH * threadRow + threadCol] = A[(blockRow /* première ligne traitée par le bloc de thread */
-                                                  + threadRow) * N /* décalage à la ligne traitée par le thread */ 
-                                                  + tile_offset  /* décalage  à la tuile courante */
-                                                  + threadCol]; /* dcalage à la colonne traitée par le thread */
+        A_s[BLOCK_WIDTH * threadRow + threadCol] = A[(blockRow /* première ligne traitée par le bloc de thread */
+                + threadRow) * N /* décalage à la ligne traitée par le thread */ 
+            + tile_offset  /* décalage à la tuile courante */
+            + threadCol]; /* décalage à la colonne traitée par le thread */
 
-      B_s[BLOCK_WIDTH * threadRow + threadCol] = B[tile_offset * N /* décalage à la ligne tuile courante */
-                                                  + threadRow * N /* décalage à la ligne du thread */
-                                                  + blockCol /* décalage à la colonne du bloc */
-                                                  + threadCol]; /* décalage à la colonne du thread */
+        B_s[BLOCK_WIDTH * threadRow + threadCol] = B[tile_offset * N /* décalage à la ligne tuile courante */
+            + threadRow * N /* décalage à la ligne du thread */
+            + blockCol /* décalage à la colonne du bloc */
+            + threadCol]; /* décalage à la colonne du thread */
 
-      // Attente que tous les threads ont bien chargé dans la mémoire partagée leurs deux indices
-			__syncthreads();
+        // Attente que tous les threads ont bien chargé dans la mémoire partagée leurs deux indices
+        __syncthreads();
 
-			for(int k =0; k < BLOCK_WIDTH; k++)
-      {
-				value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol];
-      }
+        for(int k =0; k < BLOCK_WIDTH; k++)
+        {
+            value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol];
+        }
 
-      // S'assurer que tous les threads ont bien fini le calcul du préliminaire du tile courant avant de commencer la prochaine étape du calcul de cette tile
-			__syncthreads();
-		}
+        // S'assurer que tous les threads ont bien fini le calcul du préliminaire du tile courant avant de commencer la prochaine étape du calcul de cette tile
+        __syncthreads();
+    }
 
     // Enregistrer la valeur accumulée dans C (mémoire globale)
-		C[(blockRow + threadRow) * N  + blockCol + threadCol] = value;
+    C[(blockRow + threadRow) * N  + blockCol + threadCol] = value;
 }
 
 int main(int argc, char** argv)
 {
-	int N, use_cpu;
-
-	N = (argc < 2)?1000:atoi(argv[1]);
-	use_cpu = (argc < 3)?0:atoi(argv[2]);
-
-  double t0 = 0., t1 = 0., duration = 0.;
-
-	int nbBlocks = N / BLOCK_WIDTH;
-	//if(N % BLOCK_WIDTH) nbBlocks++;
-	if(N % BLOCK_WIDTH) N += (N % BLOCK_WIDTH);
-	dim3 gridSize(nbBlocks, nbBlocks);
-	dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH);
-
-	double *A, *B, *C, *C_ref;
-	double *d_A, *d_B, *d_C;
-
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-
-	A = (double*) malloc(sizeof(double) * N * N);
-	B = (double*) malloc(sizeof(double) * N * N);
-	C = (double*) malloc(sizeof(double) * N * N);
-  if (use_cpu)
-  	C_ref = (double*) malloc(sizeof(double) * N * N);
-
-	cudaMalloc(&d_A, sizeof(double) * N * N);
-	cudaMalloc(&d_B, sizeof(double) * N * N);
-	cudaMalloc(&d_C, sizeof(double) * N * N);
-
-	// Value initialization
-  init(A, B, C, N);
-
-	cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice);
-	cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice);
-	cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice);
-
-	cudaEventRecord(start);
-	//MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
-	MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
-	cudaEventRecord(stop);
-
-	cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost);
-
-  // Compute multiplication on cpu
-  if (use_cpu){
-    t0 = get_elapsedtime();
-    mult(A, B, C_ref, N);
-    t1 = get_elapsedtime();
-  }
-
-  // get gpu elapsed time
-  cudaEventSynchronize(stop);
-  float gpu_duration = 0;
-  cudaEventElapsedTime(&gpu_duration, start, stop);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-
-  // Pretty print
-  uint64_t N_64 = (uint64_t) N;
-  uint64_t nb_op = 2* N_64 * N_64 * N_64;
-
-  if (use_cpu){
-    duration = (t1 - t0);
-    fprintf(stdout, "Performance results: \n");
-	  fprintf(stdout, "  Time: %lf s\n", duration);
-    fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);
-
-    if (verify_matrix(C, C_ref, N) != 0){
-      fprintf(stderr, "Wrong result !\n");
+    int N, use_cpu;
+
+    N = (argc < 2)?1000:atoi(argv[1]);
+    use_cpu = (argc < 3)?0:atoi(argv[2]);
+
+    double t0 = 0., t1 = 0., duration = 0.;
+
+    int nbBlocks = N / BLOCK_WIDTH;
+    //if(N % BLOCK_WIDTH) nbBlocks++;
+    if(N % BLOCK_WIDTH) N += (N % BLOCK_WIDTH);
+    dim3 gridSize(nbBlocks, nbBlocks);
+    dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH);
+
+    double *A, *B, *C, *C_ref;
+    double *d_A, *d_B, *d_C;
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    A = (double*) malloc(sizeof(double) * N * N);
+    B = (double*) malloc(sizeof(double) * N * N);
+    C = (double*) malloc(sizeof(double) * N * N);
+    if (use_cpu)
+        C_ref = (double*) malloc(sizeof(double) * N * N);
+
+    cudaMalloc(&d_A, sizeof(double) * N * N);
+    cudaMalloc(&d_B, sizeof(double) * N * N);
+    cudaMalloc(&d_C, sizeof(double) * N * N);
+
+    // Value initialization
+    init(A, B, C, N);
+
+    cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice);
+
+    cudaEventRecord(start);
+    //MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
+    MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
+    cudaEventRecord(stop);
+
+    cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost);
+
+    // Compute multiplication on cpu
+    if (use_cpu){
+        t0 = get_elapsedtime();
+        mult(A, B, C_ref, N);
+        t1 = get_elapsedtime();
+    }
+
+    // get gpu elapsed time
+    cudaEventSynchronize(stop);
+    float gpu_duration = 0;
+    cudaEventElapsedTime(&gpu_duration, start, stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    // Pretty print
+    uint64_t N_64 = (uint64_t) N;
+    uint64_t nb_op = 2* N_64 * N_64 * N_64;
+
+    if (use_cpu){
+        duration = (t1 - t0);
+        fprintf(stdout, "Performance results: \n");
+        fprintf(stdout, "  Time: %lf s\n", duration);
+        fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);
+
+        if (verify_matrix(C, C_ref, N) != 0){
+            fprintf(stderr, "Wrong result !\n");
+        }
     }
-  }
-  fprintf(stdout, "Performance results (GPU): \n");
-  fprintf(stdout, "  Time: %lf s\n", gpu_duration*1E-3);
-  fprintf(stdout, "  GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6);
+    fprintf(stdout, "Performance results (GPU): \n");
+    fprintf(stdout, "  Time: %lf s\n", gpu_duration*1E-3);
+    fprintf(stdout, "  GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6);
 
-  cudaFree(d_A);
-	cudaFree(d_B);
-	cudaFree(d_C);
+    cudaFree(d_A);
+    cudaFree(d_B);
+    cudaFree(d_C);
 
-	free(A);
-	free(B);
-	free(C);
+    free(A);
+    free(B);
+    free(C);
 
-	return 0;
+    return 0;
 }