done TP1

876c3c4e · Nicolas MARIE · 9e24fea4 · 876c3c4e · 876c3c4e · 876c3c4e
--- a/TPs/TP1/CODE/Partie2/Makefile
+++ b/TPs/TP1/CODE/Partie2/Makefile
 CC=gcc
-CFLAGS=-O3 -Wall
-EXE=mult.exe
+NVCC=nvcc
+CFLAGS=-O3 -Wall -Wextra
+NVFLAGS=-O3 -I .
+TARGET=mult_cpu
+TARGETGPU=mult_gpu

-all : $(EXE)
+all: $(TARGET) $(TARGETGPU)

-$(EXE) : dgemm.o
+$(TARGET): dgemm_cpu.o
 	$(CC) $(CFLAGS) -o $@ $<

-%.o : %.c
+$(TARGETGPU): dgemm_gpu.o
+	$(NVCC) $(NVFLAGS) -o $@ $<
+
+%_cpu.o: %.c
 	$(CC) $(CFLAGS) -c -o $@ $<

-run : mult.exe
-	./$< 1000
+%_gpu.o: %.cu
+	$(NVCC) $(NVFLAGS) -c -o $@ $<
+
+run_cpu: $(TARGET)
+	./$(TARGET) 2000
+
+run_gpu: $(TARGET)
+	./$(TARGETGPU) 1024

 proper : 
 	rm -f *.o

 clean :
-	rm -f *.o $(EXE)
+	rm -f *.o $(TARGET) $(TARGETGPU)
--- a/TPs/TP1/CODE/Partie2/dgemm.c
+++ b/TPs/TP1/CODE/Partie2/dgemm.c
@@ -23,7 +23,7 @@ int verify_matrix(double *matRef, double *matOut, int N) {
  for (i = 0; i < size; i++) {
    diff = fabs(matRef[i] - matOut[i]);
    if (diff > 0.01) {
-      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n",
+      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %ld\n",
             matRef[i], matOut[i], diff, i);
      return 1;
    }

--- a/TPs/TP1/CODE/Partie2/dgemm.cu
+++ b/TPs/TP1/CODE/Partie2/dgemm.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <inttypes.h>
+#include <math.h>
+
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+
+#include "helper_cuda.h"
+
+/** return time in second
+*/
+double get_elapsedtime(void)
+{
+	struct timespec st;
+	int err = gettime(&st);
+	if (err !=0) return 0;
+	return (double)st.tv_sec + get_sub_seconde(st);
+}
+
+int verify_matrix(double *matRef, double *matOut, int N) {
+	double diff = 0.0;
+	uint64_t i;
+	uint64_t size = N*N;
+	for (i = 0; i < size; i++) {
+		diff = fabs(matRef[i] - matOut[i]);
+		if (diff > 0.01) {
+			printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %ld\n",
+						 matRef[i], matOut[i], diff, i);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+
+void init(double* A, double* B, double* C, int size)
+{
+	int i = 0, j = 0;
+
+	srand(2019);
+
+	for(i = 0; i < size; i++)
+	{
+		for(j = 0; j < size; j++)
+		{
+			A[i * size + j] = (double) (rand() % 10) + 0.01 * (rand() % 5);
+			B[i * size + j] = (double) (rand() % 10) + 0.01 * (rand() % 5);
+			C[i * size + j] = 0.0;
+		}
+	}
+}
+
+void mult(double* A, double* B, double* C, int size)
+{
+	int i = 0, j = 0, k = 0;
+
+	for(i = 0; i < size; i++)
+	{
+		for(j = 0; j < size; j++)
+		{
+			double sum = 0.;
+			for(k = 0; k < size; k++)
+			{
+				sum += A[i * size + k] * B[k * size + j];
+			}
+			C[i * size + j] = sum;
+		}
+	}
+}
+
+__global__ void multk(double *A, double *B, double *C, int N)
+{
+	const int i = blockIdx.x * blockDim.x + threadIdx.x;
+	const int j = blockIdx.y * blockDim.y + threadIdx.y;
+	if (i >= N || j >= N)
+		return;
+	
+	double sum = 0.;
+	for (int k = 0; k < N; ++k)
+	{
+		sum += A[i * N + k] * B[k * N + j];
+	}
+	C[i * N + j] = sum;
+}
+
+__global__ void multk2(double *A, double *B, double *C, int N)
+{
+	const int i = blockIdx.y * blockDim.y + threadIdx.y;
+	const int j = blockIdx.x * blockDim.x + threadIdx.x;
+	if (i >= N || j >= N)
+		return;
+	
+	double sum = 0.;
+	for (int k = 0; k < N; ++k)
+	{
+		sum += A[i * N + k] * B[k * N + j];
+	}
+	C[i * N + j] = sum;
+}
+
+__global__ void multk3(double *A, double *B, double *C, int N)
+{
+	const int blockSize = 32;
+	__shared__ double A_s[32 * 32];
+	__shared__ double B_s[32 * 32];
+
+	const int i = blockIdx.y * blockSize + threadIdx.y;
+	const int j = blockIdx.x * blockSize + threadIdx.x;
+
+	if (i >= N || j >= N)
+		return;
+
+	double sum = 0;
+	for (int z = 0; z < N; z += blockSize)
+	{
+		int k = z + threadIdx.x;
+		int l = z + threadIdx.y;
+		if (k < N)
+			A_s[threadIdx.y * blockDim.x + threadIdx.x] = A[i * N + k];
+		if (l < N)
+			B_s[threadIdx.y * blockDim.x + threadIdx.x] = B[l * N + j];
+		__syncthreads();
+
+		for (int m = 0; m < blockSize; ++m){
+			if (z + m < N)
+				sum += A_s[threadIdx.y * blockSize + m]
+					* B_s[m * blockSize + threadIdx.x];
+		}
+		__syncthreads();
+	}
+
+	C[i * N + j] = sum;
+}
+
+int main(int argc, char** argv){
+	int N = 0;
+
+	double *A_h = NULL;
+	double *B_h = NULL;
+	double *C_h = NULL;
+	double *C_cpu = NULL;
+
+	double *A_d = NULL;
+	double *B_d = NULL;
+	double *C_d = NULL;
+
+	double t0 = 0., t1 = 0., duration = 0.;
+
+	N = (argc < 2) ? 1000 : atoi(argv[1]);
+	fprintf(stdout, "Matrix Multiplication\n	Size: %dx%d\n", N, N);
+
+	// Memory allocation
+	A_h = (double*) malloc(sizeof(double) * N * N);
+	B_h = (double*) malloc(sizeof(double) * N * N);
+	C_h = (double*) malloc(sizeof(double) * N * N);
+	C_cpu = (double*) malloc(sizeof(double) * N * N);
+
+	checkCudaErrors(cudaMalloc((void **) &A_d, sizeof(double) * N * N));
+	checkCudaErrors(cudaMalloc((void **) &B_d, sizeof(double) * N * N));
+	checkCudaErrors(cudaMalloc((void **) &C_d, sizeof(double) * N * N));
+
+	// Value initialization
+	init(A_h, B_h, C_h, N);
+	checkCudaErrors(cudaMemcpy(A_d, A_h,
+				sizeof(double) * N * N, cudaMemcpyHostToDevice));
+	checkCudaErrors(cudaMemcpy(B_d, B_h,
+				sizeof(double) * N * N, cudaMemcpyHostToDevice));
+
+	// Compute multiplication
+	
+	t0 = get_elapsedtime();
+	mult(A_h, B_h, C_cpu, N);
+	t1 = get_elapsedtime();
+
+	cudaEvent_t ct[6];
+
+	for(int i = 0; i < 6; ++i)
+		cudaEventCreate(&ct[i]);
+	
+	float cudaDuration[3];
+
+	int blockSize = 32;
+	dim3 dimBlock(blockSize, blockSize, 1);
+	dim3 dimGrid((N / blockSize) + 1, (N / blockSize) + 1, 1);
+	//int sharedMem = blockSize * blockSize * 2 * sizeof(double);
+
+	cudaEventRecord(ct[0]);
+	multk<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
+	cudaEventRecord(ct[1]);
+	cudaDeviceSynchronize();
+	getLastCudaError("Mult kernel 1 failed");
+
+	
+	cudaEventRecord(ct[2]);
+	multk2<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
+	cudaEventRecord(ct[3]);
+	cudaDeviceSynchronize();
+	getLastCudaError("Mult kernel 2 failed");
+
+
+	cudaEventRecord(ct[4]);
+	//multk3<<<dimGrid, dimBlock, sharedMem>>>(A_d, B_d, C_d, N);
+	multk3<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
+	cudaEventRecord(ct[5]);
+	cudaDeviceSynchronize();
+	getLastCudaError("Mult kernel 3 failed");
+
+
+
+	cudaEventElapsedTime(&cudaDuration[0], ct[0], ct[1]);
+	cudaEventElapsedTime(&cudaDuration[1], ct[2], ct[3]);
+	cudaEventElapsedTime(&cudaDuration[2], ct[4], ct[5]);
+
+    checkCudaErrors(cudaMemcpy(C_h, C_d,
+				sizeof(double) * N * N, cudaMemcpyDeviceToHost));
+
+	verify_matrix(C_cpu, C_h, N);
+
+	// Pretty print
+	duration = (t1 - t0);
+	uint64_t N_u64 = (uint64_t) N;
+	uint64_t nb_op = N_u64 * N_u64 * N_u64;
+
+	fprintf(stdout, "CPU Performance results: \n");
+	fprintf(stdout, "  Time: %lf s\n", duration);
+	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);
+
+	fprintf(stdout, "GPU Performance results 1: \n");
+	fprintf(stdout, "  Time: %lf s\n", cudaDuration[0] / 1000);
+	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration[0]) * 1E-3);
+
+	fprintf(stdout, "GPU Performance results 2: \n");
+	fprintf(stdout, "  Time: %lf s\n", cudaDuration[1] / 1000);
+	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration[1]) * 1E-3);
+
+	fprintf(stdout, "GPU Performance results 3: \n");
+	fprintf(stdout, "  Time: %lf s\n", cudaDuration[2] / 1000);
+	fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration[2]) * 1E-3);
+
+	return 0;
+}
--- a/TPs/TP1/CODE/Partie2/helper_cuda.h
+++ b/TPs/TP1/CODE/Partie2/helper_cuda.h
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef MAX
+#define MAX(a, b) (a > b ? a : b)
+#endif
+
+#ifndef MIN
+#define MIN(a, b) (a < b ? a : b)
+#endif
+
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+void check(cudaError_t result, char const *const func, const char *const file,
+           int const line)
+{
+  if (result)
+  {
+    fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, (int)result, cudaGetErrorName(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line)
+{
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err)
+  {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, (int)(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Beginning of GPU Architecture definitions
+int _ConvertSMVer2Cores(int major, int minor)
+{
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM
+  typedef struct
+  {
+    int SM; // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60, 64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70, 64},
+      {0x72, 64},
+      {0x75, 64},
+      {0x80, 64},
+      {0x86, 128},
+      {0x87, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1)
+  {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+    {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char *_ConvertSMVer2ArchName(int major, int minor)
+{
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct
+  {
+    int SM; // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    const char *name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1)
+  {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor))
+    {
+      return nGpuArchNameSM[index].name;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Default to use %s\n",
+      major, minor, nGpuArchNameSM[index - 1].name);
+  return nGpuArchNameSM[index - 1].name;
+}
+// end of GPU Architecture definitions
+
+// end of CUDA Helper Functions
+
+#endif // COMMON_HELPER_CUDA_H_
\ No newline at end of file
--- a/TPs/TP1/tp1.md
+++ b/TPs/TP1/tp1.md
+
+Q.13
+
+Op = 4096 * 4096 * 4096 = 68719476736 = 69 Gfp32
+Vr = 4096 * 4096 * 2 * 4 = 134217728 = 134 Mo
+Vw = 4096 * 4096 * 4 = 67108864 = 67 Mo
+
+
+
+mem théorique GPU: 56 Go/s
+flop théorique: 2273 Gfp32
+
+V = Vr + Vw = 134 + 67 = 201 Mo
+Tdata = 201 / 56 = 3.589 ms
+Tcompute = 68 / 2.273 = 29.92 ms
+rho = 29.92 / 3.59 ) 8.334
+
+Le plus gros du temps est passer sur les calculs.
+
+Q.14
+
+Performance réel: 9769.44 Mfp32
+
+9.76944 / 2273 = 0.0043 = 0.43 % d'utilisation maximal des performances de calcules.
+
+Q.15
+
+Op = 4096 * 4096 * 4096 = 68719476736 = 69 Gfp32
+Vr = 4096 * 4096 * 4096 * 2 * 4 = 549755813888 = 550 Go
+Vw = 4096 * 4096 * 4 = 67108864 = 67 Mo
+
+Le trafique mémoire au pire est de 550 Go.
+
+
+
+
+