From 3fcaefc89e4789a63f61baf04cbffda63273de57 Mon Sep 17 00:00:00 2001
From: "nicolas.marie" <nicolas.marie@ensiie.eu>
Date: Mon, 18 Dec 2023 18:51:10 +0100
Subject: [PATCH] TP4 part 2

---
 TPs/TP4/CODE/Partie2/helper_cuda.h  | 183 ++++++++++++++++++++++++++++
 TPs/TP4/CODE/Partie2/unified_mem.cu | 116 ++++++++++++++++++
 2 files changed, 299 insertions(+)
 create mode 100644 TPs/TP4/CODE/Partie2/helper_cuda.h
 create mode 100644 TPs/TP4/CODE/Partie2/unified_mem.cu

diff --git a/TPs/TP4/CODE/Partie2/helper_cuda.h b/TPs/TP4/CODE/Partie2/helper_cuda.h
new file mode 100644
index 0000000..3dd446d
--- /dev/null
+++ b/TPs/TP4/CODE/Partie2/helper_cuda.h
@@ -0,0 +1,183 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+
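+// checkCudaErrors() aborts with file/line context when a CUDA runtime call
+// fails; getLastCudaError() reports errors from the most recent kernel launch.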
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void check(cudaError_t result, char const *const func,
+                  const char *const file, int const line)
+{
+  if (result)
+  {
+    fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\"\n",
+            file, line, (int)result, cudaGetErrorName(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line)
+{
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err)
+  {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, (int)(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor)
+{
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM)
+  typedef struct
+  {
+    int SM; // 0xMm (hexadecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60, 64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70, 64},
+      {0x72, 64},
+      {0x75, 64},
+      {0x80, 64},
+      {0x86, 128},
+      {0x87, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1)
+  {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+    {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the value, fall back to the previous (closest known)
+  // architecture so the program can still run.
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Defaulting to %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char *_ConvertSMVer2ArchName(int major, int minor)
+{
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct
+  {
+    int SM; // 0xMm (hexadecimal notation), M = SM Major version,
+    // and m = SM minor version
+    const char *name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1)
+  {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor))
+    {
+      return nGpuArchNameSM[index].name;
+    }
+
+    index++;
+  }
+
+  // If we don't find the value, fall back to the previous (closest known)
+  // architecture name so the program can still run.
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Defaulting to %s\n",
+      major, minor, nGpuArchNameSM[index - 1].name);
+  return nGpuArchNameSM[index - 1].name;
+}
+// end of GPU Architecture definitions
+
+// end of CUDA Helper Functions
+
+#endif // COMMON_HELPER_CUDA_H_
\ No newline at end of file
diff --git a/TPs/TP4/CODE/Partie2/unified_mem.cu b/TPs/TP4/CODE/Partie2/unified_mem.cu
new file mode 100644
index 0000000..a44e601
--- /dev/null
+++ b/TPs/TP4/CODE/Partie2/unified_mem.cu
@@ -0,0 +1,116 @@
+#include <stdio.h>
+#include <time.h>
+
+#include "helper_cuda.h"
+
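+// Wall-clock time in seconds from the raw monotonic clock (unaffected by
+// NTP adjustments), used to time the two versions below.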
+double get_time(void)
+{
+	struct timespec ts;
+	clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+	return (double) ts.tv_sec + (1e-9 * (double) ts.tv_nsec);
+}
+
+
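+// Element-wise product C[i] = A[i] * B[i], one thread per element; threads
+// beyond the array bounds return immediately.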
+__global__ void scal_mul(float *A, float *B, float *C, size_t size){
+	size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+	if (i >= size)
+		return;
+	C[i] = A[i] * B[i];
+}
+
+int main(){
+
+	size_t size = 102300000;
+
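+	// Launch configuration: 1024 threads per block and enough blocks to cover
+	// every element; the kernel's bounds check absorbs the overshoot.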
+	size_t blockSize = 1024;
+	dim3 dimBlock(blockSize, 1, 1);
+	dim3 dimGrid((size + blockSize - 1) / blockSize, 1, 1);
+
+
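+	// Warm-up launch with no work: pays the one-time CUDA context and module
+	// initialization cost so it is not charged to the timed runs below.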
+	scal_mul<<<dimGrid, dimBlock>>>(NULL, NULL, NULL, 0);
+	checkCudaErrors(cudaDeviceSynchronize()); // ensure the warm-up has finished before timing starts
+
+	double t1;
+	double t2;
+
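+	// Version 1: explicit memory management with pinned host buffers
+	// (cudaMallocHost), separate device buffers, and cudaMemcpy transfers.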
+	t1 = get_time();
+
+	double sum1; // accumulate in double: int += float stops incrementing once the float intermediate exceeds 2^24
+
+	float *h_A = NULL;
+	float *h_B = NULL;
+	float *h_C = NULL;
+	float *d_A = NULL;
+	float *d_B = NULL;
+	float *d_C = NULL;
+
+	checkCudaErrors(cudaMallocHost(&h_A, sizeof(float) * size));
+	checkCudaErrors(cudaMallocHost(&h_B, sizeof(float) * size));
+	checkCudaErrors(cudaMallocHost(&h_C, sizeof(float) * size));
+	checkCudaErrors(cudaMalloc(&d_A, sizeof(float) * size));
+	checkCudaErrors(cudaMalloc(&d_B, sizeof(float) * size));
+	checkCudaErrors(cudaMalloc(&d_C, sizeof(float) * size));
+
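+	// Fill both input vectors with ones on the host before copying to the device.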
+	for(size_t i = 0; i < size; ++i) {
+		h_A[i] = 1;
+		h_B[i] = 1;
+	}
+
+	checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(float) * size, cudaMemcpyHostToDevice));
+	checkCudaErrors(cudaMemcpy(d_B, h_B, sizeof(float) * size, cudaMemcpyHostToDevice));
+
+	scal_mul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, size);
+	checkCudaErrors(cudaDeviceSynchronize());
+
+	checkCudaErrors(cudaMemcpy(h_C, d_C, sizeof(float) * size, cudaMemcpyDeviceToHost));
+
+	sum1 = 0;
+	for(size_t i = 0; i < size; ++i) {
+		sum1 += h_C[i];
+	}
+
+	checkCudaErrors(cudaFree(d_A));
+	checkCudaErrors(cudaFree(d_B));
+	checkCudaErrors(cudaFree(d_C));
+	checkCudaErrors(cudaFreeHost(h_A));
+	checkCudaErrors(cudaFreeHost(h_B));
+	checkCudaErrors(cudaFreeHost(h_C));
+
+	t2 = get_time();
+	printf("Normal CUDA: %.0f in %g[s]\n", sum1, t2 - t1);
+
+
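+	// Version 2: unified (managed) memory. cudaMallocManaged returns pointers
+	// that are valid on both host and device, so no explicit copies are needed.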
+	t1 = get_time();
+
+	double sum2;
+
+	float *u_A = NULL;
+	float *u_B = NULL;
+	float *u_C = NULL;
+	checkCudaErrors(cudaMallocManaged(&u_A, sizeof(float) * size));
+	checkCudaErrors(cudaMallocManaged(&u_B, sizeof(float) * size));
+	checkCudaErrors(cudaMallocManaged(&u_C, sizeof(float) * size));
+
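+	// Managed buffers are initialized directly from the host; the runtime
+	// migrates the data to the GPU automatically when the kernel accesses it.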
+	for(size_t i = 0; i < size; ++i) {
+		u_A[i] = 1;
+		u_B[i] = 1;
+	}
+
+	scal_mul<<<dimGrid, dimBlock>>>(u_A, u_B, u_C, size);
+	checkCudaErrors(cudaDeviceSynchronize());
+
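+	// The synchronization above is required before the host reads u_C: managed
+	// memory must not be touched by the CPU while the kernel may still be running.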
+	sum2 = 0;
+	for(size_t i = 0; i < size; ++i) {
+		sum2 += u_C[i];
+	}
+
+	checkCudaErrors(cudaFree(u_A));
+	checkCudaErrors(cudaFree(u_B));
+	checkCudaErrors(cudaFree(u_C));
+
+	t2 = get_time();
+	printf("Unified Mem CUDA: %.0f in %g[s]\n", sum2, t2 - t1);
+
+	return 0;
+}
-- 
GitLab