From 08504bd824f249dac5a365f19692f40d3627e1ae Mon Sep 17 00:00:00 2001
From: "nicolas.marie" <nicolas.marie@ensiie.eu>
Date: Thu, 25 Jan 2024 11:48:50 +0100
Subject: [PATCH] pgpu debut TP3

---
 TPs/TP3/CODE/AXPY/axpy.c    |  40 +++++++-------
 TPs/TP3/CODE/DGEMM/dgemm.c  | 102 +++++++++++++++++++-----------------
 TPs/TP3/CODE/ONGPU/Makefile |   2 +-
 TPs/TP3/CODE/ONGPU/ongpu.c  |  12 +++++
 4 files changed, 88 insertions(+), 68 deletions(-)

diff --git a/TPs/TP3/CODE/AXPY/axpy.c b/TPs/TP3/CODE/AXPY/axpy.c
index 2356954..bea8def 100755
--- a/TPs/TP3/CODE/AXPY/axpy.c
+++ b/TPs/TP3/CODE/AXPY/axpy.c
@@ -4,28 +4,30 @@
 
 int main(int argc, char** argv)
 {
-  double alpha = 2;
-  double *X = NULL, *Y = NULL;
-  int N = 1000;
-  if (argc > 1) N = atoi(argv[1]);
+	double alpha = 2;
+	double *X = NULL, *Y = NULL;
+	int N = 100000000;
+	if (argc > 1) N = atoi(argv[1]);
 
-  X = (double*) malloc(sizeof(double) * N);
-  Y = (double*) malloc(sizeof(double) * N);
+	X = (double*) malloc(sizeof(double) * N);
+	Y = (double*) malloc(sizeof(double) * N);
 
-  for(int i = 0; i < N; ++i)
-  {
-    X[i] = i;
-    Y[i] = X[i] + i;
-  }
+	for(int i = 0; i < N; ++i)
+	{
+		X[i] = i;
+		Y[i] = X[i] + i;
+	}
 
-  for(int i = 0; i < N; ++i)
-  {
-    Y[i] += alpha * X[i];
-  }
+	// Offload Y += alpha * X: X is only read on the device, Y is copied in and copied back.
+	#pragma omp target teams distribute parallel for map(to: X[:N]) map(tofrom: Y[:N])
+	for(int i = 0; i < N; ++i)
+	{
+		Y[i] += alpha * X[i];
+	}
 
-  int stop = (N > 5)?5:N;
-  for(int i = 0; i < stop; ++i)
-    fprintf(stdout, "Y[%d] = %f\n", i, Y[i]);
+	int stop = (N > 5)?5:N;
+	for(int i = 0; i < stop; ++i)
+		fprintf(stdout, "Y[%d] = %f\n", i, Y[i]);
 
-  return 0;
+	return 0;
 }
diff --git a/TPs/TP3/CODE/DGEMM/dgemm.c b/TPs/TP3/CODE/DGEMM/dgemm.c
index 817910f..b663d00 100755
--- a/TPs/TP3/CODE/DGEMM/dgemm.c
+++ b/TPs/TP3/CODE/DGEMM/dgemm.c
@@ -10,72 +10,78 @@ typedef struct timespec struct_time;
 */
 double get_elapsedtime(void)
 {
-  struct_time st;
-  int err = gettime(&st);
-  if (err !=0) return 0;
-  return (double)st.tv_sec + get_sub_seconde(st);
+	struct_time st;
+	int err = gettime(&st);
+	if (err !=0) return 0;
+	return (double)st.tv_sec + get_sub_seconde(st);
 }
 
 void mult(int N, float *A, float* B, float* C)
 {
-  int i = 0, j = 0, k = 0;
-  int total_size = N*N;
-  for(i = 0; i < N; ++i)
-  {
-    for(j = 0; j < N; ++j)
-    {
-      float val = 0.;
-      for(k = 0; k < N; ++k)
-      {
-        val += A[i * N + k] * B[k * N + j];
-      }
-      C[i * N + j] = val;
-    }
-  }
+	// Map the full N*N array sections: naming only the pointer would map the pointer, not the data.
+	// C is write-only here (each element is built in a local accumulator), so map(from:) is enough.
+	#pragma omp target teams distribute parallel for map(to: A[0:N*N], B[0:N*N]) map(from: C[0:N*N])
+	for(int i = 0; i < N; ++i)
+	{
+		for(int j = 0; j < N; ++j) // j and k are declared in-loop so each thread has its own copy
+		{
+			float val = 0.f;
+			for(int k = 0; k < N; ++k)
+				val += A[i * N + k] * B[k * N + j];
+			C[i * N + j] = val;
+		}
+	}
+}
+
+[[clang::optnone]] // clang-specific attribute; see the Clang attribute reference for `optnone`
+void donotopti(float *C){ // empty body, C is ignored; NOTE(review): presumably a sink so mult()'s output counts as used - confirm
+	return;
 }
 
 int main(int argc, char** argv)
 {
-  double t0 = 0., t1 = 0., duration = 0.;
+	double t0 = 0., t1 = 0., duration = 0.;
 
-  float *A = NULL;
-  float *B = NULL;
-  float *C = NULL;
+	float *A = NULL;
+	float *B = NULL;
+	float *C = NULL;
 
-  int i = 0;
+	int i = 0;
 
-  int N = 1024;
-  if(argc > 1)
-  {
-    N = atoi(argv[1]);
-  }
+	int N = 1024;
+	if(argc > 1)
+	{
+		N = atoi(argv[1]);
+	}
 
-  fprintf(stdout, "> Matrix Multiplication Kernel...\n");
-  fprintf(stdout, "    Size: %dx%d\n", N, N);
+	fprintf(stdout, "> Matrix Multiplication Kernel...\n");
+	fprintf(stdout, "		 Size: %dx%d\n", N, N);
 #pragma omp parallel default(none) shared(stdout)
-  {
+	{
 #pragma omp single
-    fprintf(stdout, "    Running on %d threads\n", omp_get_num_threads());
-  }
+		fprintf(stdout, "		 Running on %d threads\n", omp_get_num_threads());
+	}
+
+	A = (float*) calloc(N * N, sizeof(float));
+	B = (float*) calloc(N * N, sizeof(float));
+	C = (float*) calloc(N * N, sizeof(float));
 
-  A = (float*) calloc(N * N, sizeof(float));
-  B = (float*) calloc(N * N, sizeof(float));
-  C = (float*) calloc(N * N, sizeof(float));
+	for(i = 0; i < N * N; ++i)
+	{
+		A[i] = 1. * i;
+		B[i] = N*N - (1. * i);
+		C[i] = 0.;
+	}
 
-  for(i = 0; i < N * N; ++i)
-  {
-    A[i] = 1. * i;
-    B[i] = N*N - (1. * i);
-    C[i] = 0.;
-  }
+	t0 = get_elapsedtime();
+	mult(N, A, B, C);
+	t1 = get_elapsedtime();
 
-  t0 = get_elapsedtime();
-  mult(N, A, B, C);
-  t1 = get_elapsedtime();
+	duration = (t1 - t0);
 
-  duration = (t1 - t0);
+	fprintf(stdout, "		 Elapsed Time : %f\n", duration);
 
-  fprintf(stdout, "    Elapsed Time : %f\n", duration);
+	donotopti(C);
 
-  return 0;
+	return 0;
 }
diff --git a/TPs/TP3/CODE/ONGPU/Makefile b/TPs/TP3/CODE/ONGPU/Makefile
index 76dd3a1..d57f904 100755
--- a/TPs/TP3/CODE/ONGPU/Makefile
+++ b/TPs/TP3/CODE/ONGPU/Makefile
@@ -1,4 +1,4 @@
-CC=clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_61
+CC=clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
 CFLAGS=-O3 -Wall
 LDFLAGS=-lm -lrt
 
diff --git a/TPs/TP3/CODE/ONGPU/ongpu.c b/TPs/TP3/CODE/ONGPU/ongpu.c
index 901ab15..d10f8f9 100755
--- a/TPs/TP3/CODE/ONGPU/ongpu.c
+++ b/TPs/TP3/CODE/ONGPU/ongpu.c
@@ -17,5 +17,17 @@ int main()
 	else
 		printf("### Unable to use the GPU, using CPU! ###\n");
 
+	printf("Nb. devices: %d\n", omp_get_num_devices());
+
+	int team = 0; // receives the number of teams reported from inside the target region
+#pragma omp target map(from: team)
+	{
+#pragma omp teams
+		{
+			if (omp_get_team_num() == 0) { team = omp_get_num_teams(); } // single writer: avoids all teams racing on the shared variable
+		}
+	}
+
+	printf("Nb. teams: %d\n", team);
 	return 0;
 }
-- 
GitLab