diff --git a/TPs/TP0/CODE/device_query/prog.cu b/TPs/TP0/CODE/device_query/prog.cu
index 07d8b357d74230ad627ba43f6524a30b12e5f751..d717ee3b94e62d8ff1c31e2bd227a9ee025440d0 100644
--- a/TPs/TP0/CODE/device_query/prog.cu
+++ b/TPs/TP0/CODE/device_query/prog.cu
@@ -25,8 +25,8 @@ int main(int argc, char **argv) {
     printf("  GPU Clock Rate (MHz):                          %d\n", prop.clockRate/1000);
     printf("  Memory Clock Rate (MT/s):                      %d\n", prop.memoryClockRate/1000);
     printf("  Memory Bus Width (bits):                       %d\n", prop.memoryBusWidth);
-	printf("  Memory Band width (Gio/s):                     %d\n",
-	(((prop.memoryClockRate / 1024) * (prop.memoryBusWidth / 8)) / 1024) * 2);
+	printf("  Memory Band width (Go/s):                      %d\n",
+	(((prop.memoryClockRate / 1000) * (prop.memoryBusWidth / 8)) / 1000));
     printf("  CUDA Cores/MP:                                 %d\n", _ConvertSMVer2Cores(prop.major, prop.minor));
     printf("  CUDA Cores:                                    %d\n", _ConvertSMVer2Cores(prop.major, prop.minor) *
            prop.multiProcessorCount);
diff --git a/TPs/TP1/CODE/Partie2/Makefile b/TPs/TP1/CODE/Partie2/Makefile
index e34c97c2b64e4a3d9d15d1b7791002bcad7197cd..4ec4d51a39cdf5450329fdbfd54335faaa144de8 100755
--- a/TPs/TP1/CODE/Partie2/Makefile
+++ b/TPs/TP1/CODE/Partie2/Makefile
@@ -23,7 +23,7 @@ run_cpu: $(TARGET)
 	./$(TARGET) 2000
 
 run_gpu: $(TARGET)
-	./$(TARGETGPU) 1024
+	./$(TARGETGPU) 2048
 
 proper : 
 	rm -f *.o
diff --git a/TPs/TP1/CODE/Partie2/dgemm.cu b/TPs/TP1/CODE/Partie2/dgemm.cu
index eaea2550af035b224b79e5f74b6ea30e88239446..d31c37d6e137765420577c3c4e7ca991696eef9e 100755
--- a/TPs/TP1/CODE/Partie2/dgemm.cu
+++ b/TPs/TP1/CODE/Partie2/dgemm.cu
@@ -7,6 +7,8 @@
 #define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
 #define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
 
+#define BLOCK_SIZE 32
+
 #include "helper_cuda.h"
 
 /** return time in second
@@ -87,8 +89,8 @@ __global__ void multk(double *A, double *B, double *C, int N)
 
 __global__ void multk2(double *A, double *B, double *C, int N)
 {
-	const int i = blockIdx.y * blockDim.y + threadIdx.y;
 	const int j = blockIdx.x * blockDim.x + threadIdx.x;
+	const int i = blockIdx.y * blockDim.y + threadIdx.y;
 	if (i >= N || j >= N)
 		return;
 	
@@ -102,14 +104,14 @@ __global__ void multk2(double *A, double *B, double *C, int N)
 
 __global__ void multk3(double *A, double *B, double *C, int N)
 {
-	const int blockSize = 32;
-	__shared__ double A_s[32 * 32];
-	__shared__ double B_s[32 * 32];
+	const int blockSize = BLOCK_SIZE;
+	__shared__ double A_s[BLOCK_SIZE * BLOCK_SIZE];
+	__shared__ double B_s[BLOCK_SIZE * BLOCK_SIZE];
 
-	const int i = blockIdx.y * blockSize + threadIdx.y;
-	const int j = blockIdx.x * blockSize + threadIdx.x;
+	const int x = blockIdx.x * blockSize + threadIdx.x;
+	const int y = blockIdx.y * blockSize + threadIdx.y;
 
-	if (i >= N || j >= N)
+	if (x >= N || y >= N)
 		return;
 
 	double sum = 0;
@@ -118,9 +120,9 @@ __global__ void multk3(double *A, double *B, double *C, int N)
 		int k = z + threadIdx.x;
 		int l = z + threadIdx.y;
 		if (k < N)
-			A_s[threadIdx.y * blockDim.x + threadIdx.x] = A[i * N + k];
+			A_s[threadIdx.y * blockDim.x + threadIdx.x] = A[y * N + k];
 		if (l < N)
-			B_s[threadIdx.y * blockDim.x + threadIdx.x] = B[l * N + j];
+			B_s[threadIdx.y * blockDim.x + threadIdx.x] = B[l * N + x];
 		__syncthreads();
 
 		for (int m = 0; m < blockSize; ++m){
@@ -131,7 +133,40 @@ __global__ void multk3(double *A, double *B, double *C, int N)
 		__syncthreads();
 	}
 
-	C[i * N + j] = sum;
+	C[y * N + x] = sum;
+}
+
+__global__ void d_shared_mult(double* A, double* B, double* C, int size)
+{
+  __shared__ double s_A[1024];
+  __shared__ double s_B[1024];
+
+  unsigned x = blockDim.x*blockIdx.x + threadIdx.x;
+  unsigned y = blockDim.y*blockIdx.y + threadIdx.y;
+  //pos in memory is x + y*size
+
+  int nsize = size/blockDim.x; //we suppose it is squared
+  double sum = 0.0;
+
+  for (int i = 0; i < nsize; i++)
+  {
+    //LOADING IN SHARED MEMORY
+    s_A[threadIdx.x + threadIdx.y*blockDim.x]
+      = A[(i*blockDim.x + blockIdx.y*gridDim.x*blockDim.x*blockDim.y)
+          +(threadIdx.x + threadIdx.y*blockDim.x*gridDim.x)];
+
+    s_B[threadIdx.x + threadIdx.y*blockDim.x]
+      = B[(blockIdx.x*blockDim.x + i*gridDim.x*blockDim.x*blockDim.y)
+          +(threadIdx.x + threadIdx.y*blockDim.x*gridDim.x)];
+
+    __syncthreads();
+    for (int j = 0; j < blockDim.x; j++)
+    {
+      sum += s_A[j + threadIdx.y*blockDim.x] * s_B[threadIdx.x + j*blockDim.x];
+    }
+    __syncthreads();
+  }
+  C[x + y*size] = sum;
 }
 
 int main(int argc, char** argv){
@@ -148,7 +183,7 @@ int main(int argc, char** argv){
 
 	double t0 = 0., t1 = 0., duration = 0.;
 
-	N = (argc < 2) ? 1000 : atoi(argv[1]);
+	N = (argc < 2) ? 1024 : atoi(argv[1]);
 	fprintf(stdout, "Matrix Multiplication\n	Size: %dx%d\n", N, N);
 
 	// Memory allocation
@@ -168,76 +203,98 @@ int main(int argc, char** argv){
 	checkCudaErrors(cudaMemcpy(B_d, B_h,
 				sizeof(double) * N * N, cudaMemcpyHostToDevice));
 
-	// Compute multiplication
-	
+
+
+	uint64_t N_u64 = (uint64_t) N;
+	uint64_t nb_op = N_u64 * N_u64 * N_u64;
+
+
+
+	// CPU Compute multiplication	
 	t0 = get_elapsedtime();
 	mult(A_h, B_h, C_cpu, N);
 	t1 = get_elapsedtime();
 
-	cudaEvent_t ct[6];
-
-	for(int i = 0; i < 6; ++i)
-		cudaEventCreate(&ct[i]);
+	duration = (t1 - t0);
 	
-	float cudaDuration[3];
+	fprintf(stdout, "CPU Performance results: \n");
+	fprintf(stdout, "  Time: %lf s\n", duration);
+	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);
 
-	int blockSize = 32;
+
+
+	// CUDA PREP
+	cudaEvent_t ct0;
+	cudaEvent_t ct1;
+	cudaEventCreate(&ct0);
+	cudaEventCreate(&ct1);
+	float cudaDuration;
+
+	int blockSize = BLOCK_SIZE;
 	dim3 dimBlock(blockSize, blockSize, 1);
 	dim3 dimGrid((N / blockSize) + 1, (N / blockSize) + 1, 1);
 	//int sharedMem = blockSize * blockSize * 2 * sizeof(double);
 
-	cudaEventRecord(ct[0]);
+
+
+	cudaEventRecord(ct0);
 	multk<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
-	cudaEventRecord(ct[1]);
+	cudaEventRecord(ct1);
 	cudaDeviceSynchronize();
 	getLastCudaError("Mult kernel 1 failed");
+	cudaEventElapsedTime(&cudaDuration, ct0, ct1);
+	checkCudaErrors(cudaMemcpy(C_h, C_d,
+				sizeof(double) * N * N, cudaMemcpyDeviceToHost));
+	fprintf(stdout, "GPU Performance results 1: \n");
+	fprintf(stdout, "  Time: %lf s\n", cudaDuration / 1000);
+	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration) * 1E-3);
+	verify_matrix(C_cpu, C_h, N);
 
-	
-	cudaEventRecord(ct[2]);
+
+
+	cudaEventRecord(ct0);
 	multk2<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
-	cudaEventRecord(ct[3]);
+	cudaEventRecord(ct1);
 	cudaDeviceSynchronize();
 	getLastCudaError("Mult kernel 2 failed");
+	cudaEventElapsedTime(&cudaDuration, ct0, ct1);
+	checkCudaErrors(cudaMemcpy(C_h, C_d,
+				sizeof(double) * N * N, cudaMemcpyDeviceToHost));
+	fprintf(stdout, "GPU Performance results 2: \n");
+	fprintf(stdout, "  Time: %lf s\n", cudaDuration / 1000);
+	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration) * 1E-3);
+	verify_matrix(C_cpu, C_h, N);
 
 
-	cudaEventRecord(ct[4]);
+	cudaEventRecord(ct0);
 	//multk3<<<dimGrid, dimBlock, sharedMem>>>(A_d, B_d, C_d, N);
 	multk3<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
-	cudaEventRecord(ct[5]);
+	cudaEventRecord(ct1);
 	cudaDeviceSynchronize();
 	getLastCudaError("Mult kernel 3 failed");
-
-
-
-	cudaEventElapsedTime(&cudaDuration[0], ct[0], ct[1]);
-	cudaEventElapsedTime(&cudaDuration[1], ct[2], ct[3]);
-	cudaEventElapsedTime(&cudaDuration[2], ct[4], ct[5]);
-
-    checkCudaErrors(cudaMemcpy(C_h, C_d,
+	cudaEventElapsedTime(&cudaDuration, ct0, ct1);
+	checkCudaErrors(cudaMemcpy(C_h, C_d,
 				sizeof(double) * N * N, cudaMemcpyDeviceToHost));
-
+	fprintf(stdout, "GPU Performance results 3: \n");
+	fprintf(stdout, "  Time: %lf s\n", cudaDuration / 1000);
+	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration) * 1E-3);
 	verify_matrix(C_cpu, C_h, N);
 
-	// Pretty print
-	duration = (t1 - t0);
-	uint64_t N_u64 = (uint64_t) N;
-	uint64_t nb_op = N_u64 * N_u64 * N_u64;
 
-	fprintf(stdout, "CPU Performance results: \n");
-	fprintf(stdout, "  Time: %lf s\n", duration);
-	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);
 
-	fprintf(stdout, "GPU Performance results 1: \n");
-	fprintf(stdout, "  Time: %lf s\n", cudaDuration[0] / 1000);
-	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration[0]) * 1E-3);
-
-	fprintf(stdout, "GPU Performance results 2: \n");
-	fprintf(stdout, "  Time: %lf s\n", cudaDuration[1] / 1000);
-	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration[1]) * 1E-3);
+	cudaEventRecord(ct0);
+	d_shared_mult<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
+	cudaEventRecord(ct1);
+	cudaDeviceSynchronize();
+	getLastCudaError("Mult kernel Anzo failed");
+	cudaEventElapsedTime(&cudaDuration, ct0, ct1);
+	checkCudaErrors(cudaMemcpy(C_h, C_d,
+				sizeof(double) * N * N, cudaMemcpyDeviceToHost));
 
-	fprintf(stdout, "GPU Performance results 3: \n");
-	fprintf(stdout, "  Time: %lf s\n", cudaDuration[2] / 1000);
-	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration[2]) * 1E-3);
+	fprintf(stdout, "GPU Performance results Anzo: \n");
+	fprintf(stdout, "  Time: %lf s\n", cudaDuration / 1000);
+	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration) * 1E-3);
+	verify_matrix(C_cpu, C_h, N);
 
 	return 0;
 }