Skip to content
Extraits de code Groupes Projets
Valider bcde70ef rédigé par Nicolas MARIE's avatar Nicolas MARIE
Parcourir les fichiers

TP2

parent 08448aef
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
# Build the CPU (gcc) and GPU (nvcc) versions of TP2.
CC=gcc
NVCC=nvcc
CFLAGS=-O3 -Wall -Wextra
NVFLAGS=-O3 -I . -Xcompiler="-Wall -Wextra -Werror -O3"
LDFLAGS=-lm
TARGET=tp2cpu
TARGETGPU=tp2gpu

.PHONY: all run proper clean

all: $(TARGET) $(TARGETGPU)

$(TARGET): tp2_cpu.o
	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)

$(TARGETGPU): tp2_gpu.o
	$(NVCC) $(NVFLAGS) -o $@ $< $(LDFLAGS)

%_cpu.o: %.c
	$(CC) $(CFLAGS) -c -o $@ $<

%_gpu.o: %.cu
	$(NVCC) $(NVFLAGS) -c -o $@ $<

run: $(TARGET) $(TARGETGPU)
	./$(TARGET) 10240
	./$(TARGETGPU) 10240

# Remove objects only.
proper:
	rm -f *.o

# Remove objects and executables.
clean:
	rm -f *.o $(TARGET) $(TARGETGPU)
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions for initialization and error checking
#ifndef COMMON_HELPER_CUDA_H_
#define COMMON_HELPER_CUDA_H_
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef MAX
// Fully parenthesized so arguments containing lower-precedence operators
// (e.g. MAX(x & 1, y)) expand correctly. Note: each argument is still
// evaluated twice, so avoid side effects in the arguments.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
#ifndef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#endif
// Wrap every CUDA runtime call; aborts with file/line context on error.
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
// Check for a pending CUDA error (e.g. after a kernel launch).
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
// Abort the program with a diagnostic if a CUDA runtime call returned an
// error. Used via the checkCudaErrors() macro, which supplies the call
// text, file and line.
//
// `inline` keeps the definition header-safe: without it, including this
// header from more than one translation unit causes multiple-definition
// link errors (the sibling __getLastCudaError is already inline).
inline void check(cudaError_t result, char const *const func, const char *const file,
                  int const line)
{
  if (result)
  {
    fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line,
            (int)result, cudaGetErrorName(result), func);
    exit(EXIT_FAILURE);
  }
}
// Check for any pending CUDA error (typically from a preceding kernel
// launch) and abort with `errorMessage` plus file/line context if one
// is found. Used via the getLastCudaError() macro.
inline void __getLastCudaError(const char *errorMessage, const char *file,
                               const int line)
{
  const cudaError_t status = cudaGetLastError();

  if (status != cudaSuccess)
  {
    fprintf(stderr,
            "%s(%i) : getLastCudaError() CUDA error :"
            " %s : (%d) %s.\n",
            file, line, errorMessage, (int)(status),
            cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}
// Beginning of GPU Architecture definitions
// Map an SM (compute capability) version to the number of CUDA cores per
// multiprocessor. Unknown versions fall back to the last known entry,
// with a warning printed on stdout.
//
// `inline` keeps the definition header-safe when this header is included
// from several translation units (matches _ConvertSMVer2ArchName below).
inline int _ConvertSMVer2Cores(int major, int minor)
{
  // Table key is 0xMm (hexadecimal): M = SM major version, m = SM minor.
  typedef struct
  {
    int SM;
    int Cores;
  } sSMtoCores;

  sSMtoCores nGpuArchCoresPerSM[] = {
      {0x30, 192}, // Kepler
      {0x32, 192},
      {0x35, 192},
      {0x37, 192},
      {0x50, 128}, // Maxwell
      {0x52, 128},
      {0x53, 128},
      {0x60, 64},  // Pascal
      {0x61, 128},
      {0x62, 128},
      {0x70, 64},  // Volta
      {0x72, 64},  // Xavier
      {0x75, 64},  // Turing
      {0x80, 64},  // Ampere
      {0x86, 128},
      {0x87, 128},
      {0x89, 128}, // Ada
      {0x90, 128}, // Hopper
      {-1, -1}};   // sentinel

  int index = 0;
  while (nGpuArchCoresPerSM[index].SM != -1)
  {
    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
    {
      return nGpuArchCoresPerSM[index].Cores;
    }
    index++;
  }

  // Unknown SM version: default to the last (most recent) known entry
  // so callers can still run.
  printf(
      "MapSMtoCores for SM %d.%d is undefined."
      " Default to use %d Cores/SM\n",
      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
}
// Map an SM (compute capability) version to its architecture name.
// Unknown versions fall back to the last real table entry, with a
// warning printed on stdout.
inline const char *_ConvertSMVer2ArchName(int major, int minor)
{
  // Table key is 0xMm (hexadecimal): M = SM major version, m = SM minor.
  typedef struct
  {
    int SM;
    const char *name;
  } sSMtoArchName;

  sSMtoArchName archTable[] = {
      {0x30, "Kepler"},
      {0x32, "Kepler"},
      {0x35, "Kepler"},
      {0x37, "Kepler"},
      {0x50, "Maxwell"},
      {0x52, "Maxwell"},
      {0x53, "Maxwell"},
      {0x60, "Pascal"},
      {0x61, "Pascal"},
      {0x62, "Pascal"},
      {0x70, "Volta"},
      {0x72, "Xavier"},
      {0x75, "Turing"},
      {0x80, "Ampere"},
      {0x86, "Ampere"},
      {-1, "Graphics Device"}}; // sentinel

  const int sm = (major << 4) + minor;
  int i;

  for (i = 0; archTable[i].SM != -1; ++i)
  {
    if (archTable[i].SM == sm)
    {
      return archTable[i].name;
    }
  }

  // Unknown SM version: default to the entry just before the sentinel.
  printf(
      "MapSMtoArchName for SM %d.%d is undefined."
      " Default to use %s\n",
      major, minor, archTable[i - 1].name);
  return archTable[i - 1].name;
}
// end of GPU Architecture definitions
// end of CUDA Helper Functions
#endif // COMMON_HELPER_CUDA_H_
\ No newline at end of file
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <inttypes.h>
#include "helper_cuda.h"
#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
#define NSTREAMS 4
/*
 * Wall-clock time in seconds from the raw monotonic clock, as a double.
 * Returns 0 when the clock cannot be read.
 */
double get_elapsedtime(void)
{
  struct timespec ts;

  if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0)
  {
    return 0;
  }
  return (double)ts.tv_sec + 1e-9 * (double)ts.tv_nsec;
}
/*
 * Check out[] against the expected exp(-|sin(i)|) and report the average
 * absolute error: PASSED on stdout, FAILED on stderr.
 *
 * Fixes vs. the previous revision: the duplicated `float err` declaration
 * (a merge/diff artifact that broke compilation) is removed, and fabs()
 * is used instead of abs(), whose C overload truncates to int.
 */
void verif(float *out, int sz)
{
  float err = 0.f;

  for (int i = 0; i < sz; i++)
  {
    err += fabs(out[i] - exp(-fabs(sin(i * 1.0))));
  }

  /* Guard against division by zero on an empty input. */
  float avg = (sz > 0) ? err / sz : 0.f;

  if (avg < 1.e-4)
  {
    fprintf(stdout, "TEST PASSED (error %3.5f < 1.e-4)\n", avg);
  }
  else
  {
    fprintf(stderr, "TEST FAILED (error %3.5f > 1.e-4)\n", avg);
  }
}
/*
 * CPU reference: apply out[i] = exp(-|out[i]|) element-wise, once.
 *
 * Fixes vs. the previous revision: the loop was duplicated by a
 * merge/diff artifact, applying the transform twice; and abs() (the
 * integer overload in C) is replaced by fabs().
 */
void func(float *out, int size)
{
  for (int i = 0; i < size; ++i)
  {
    out[i] = exp(-fabs(out[i]));
  }
}
// GPU kernel: out[i] = exp(-|out[i]|), one thread per element.
// Bounds-checked so grids rounded up past `size` do not write out of range.
__global__ void funck(float *out, int size)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index >= size)
    return;

  // Single-precision math: exp()/abs() on a float promote to double,
  // which is needlessly slow on most GPUs and mismatches float storage.
  out[index] = expf(-fabsf(out[index]));
}
/*
 * Driver: fill a pinned host buffer with sin(i), process it on the GPU in
 * `nstreams` chunks (H2D copy, kernel, D2H copy enqueued per stream so
 * copies and kernels can overlap), time the whole pipeline and each
 * stream, then check the GPU result and a CPU reference with verif().
 *
 * Usage: tp2gpu [size] [nstreams]   (defaults: 1024, NSTREAMS)
 *
 * Fixes vs. the previous revision (merge/diff artifacts): duplicate
 * `int size` block, stray `size *= NSTREAMS`, truncated `cudaMalloc(`
 * fragment and duplicate `return 0` removed; events are now recorded on
 * their own stream (recording on the default stream did not bracket the
 * per-stream work); the last stream handles the remainder when size is
 * not a multiple of nstreams; events are destroyed before exit.
 */
int main(int argc, char** argv)
{
  int size = 1024;
  if (argc >= 2)
  {
    size = atoi(argv[1]);
  }

  int nstreams = NSTREAMS;
  if (argc >= 3)
  {
    nstreams = atoi(argv[2]);
  }

  if (size <= 0 || nstreams <= 0 || nstreams > size)
  {
    fprintf(stderr, "invalid arguments: need 0 < nstreams <= size\n");
    return -1;
  }

  /* Base chunk per stream; the last stream also takes the remainder. */
  int subsize = size / nstreams;

  cudaStream_t streams[nstreams];
  for (int i = 0; i < nstreams; ++i)
  {
    checkCudaErrors(cudaStreamCreate(&streams[i]));
  }

  float *tab_h = NULL;
  float *tab_d = NULL;
  /* Pinned host memory is required for cudaMemcpyAsync to overlap. */
  checkCudaErrors(cudaMallocHost((void **) &tab_h, sizeof(float) * size));
  checkCudaErrors(cudaMalloc((void **) &tab_d, sizeof(float) * size));

  for (int i = 0; i < size; ++i)
  {
    tab_h[i] = sin(i * 1.);
  }

  int blocksize = 1024;
  dim3 dimBlock(blocksize, 1, 1);

  /* One pair of events per stream: ce[2i] before the kernel, ce[2i+1] after. */
  cudaEvent_t ce[nstreams * 2];
  for (int i = 0; i < nstreams * 2; ++i)
  {
    checkCudaErrors(cudaEventCreate(&ce[i]));
  }

  double t0 = get_elapsedtime();

  for (int i = 0; i < nstreams; ++i)
  {
    int offset = subsize * i;
    int chunk = (i == nstreams - 1) ? size - offset : subsize;
    dim3 dimGrid((chunk + blocksize - 1) / blocksize, 1, 1);

    checkCudaErrors(cudaMemcpyAsync(
        tab_d + offset, tab_h + offset,
        sizeof(float) * chunk,
        cudaMemcpyHostToDevice, streams[i]));
    /* Record on streams[i]: an event recorded on the default stream
     * would not time the work enqueued on this stream. */
    checkCudaErrors(cudaEventRecord(ce[i * 2], streams[i]));
    funck<<<dimGrid, dimBlock, 0, streams[i]>>>(tab_d + offset, chunk);
    checkCudaErrors(cudaEventRecord(ce[i * 2 + 1], streams[i]));
    checkCudaErrors(cudaMemcpyAsync(
        tab_h + offset, tab_d + offset,
        sizeof(float) * chunk,
        cudaMemcpyDeviceToHost, streams[i]));
  }

  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("funck kernel failed");

  double t1 = get_elapsedtime();

  fprintf(stdout, "Total Time (%d): %lf ms\n", nstreams, (t1 - t0) * 1000);

  float cudaDuration, cudaTotalDuration = 0;
  for (int i = 0; i < nstreams; ++i)
  {
    checkCudaErrors(cudaEventElapsedTime(&cudaDuration, ce[i * 2], ce[i * 2 + 1]));
    cudaTotalDuration += cudaDuration;
    fprintf(stdout, "Time of stream %d: %lf ms\n", i, cudaDuration);
  }
  fprintf(stdout, "Total Time of streams: %lf ms\n", cudaTotalDuration);

  /* Check the GPU result. */
  verif(tab_h, size);

  /* CPU reference on a plain heap buffer, checked the same way. */
  float *tab = (float *) malloc(sizeof(float) * size);
  if (tab == NULL)
  {
    fprintf(stderr, "Bad allocation\n");
    return -1;
  }
  for (int i = 0; i < size; ++i)
  {
    tab[i] = sin(i * 1.);
  }
  func(tab, size);
  verif(tab, size);

  /* Cleanup. */
  free(tab);
  checkCudaErrors(cudaFree(tab_d));
  checkCudaErrors(cudaFreeHost(tab_h));
  for (int i = 0; i < nstreams * 2; ++i)
  {
    checkCudaErrors(cudaEventDestroy(ce[i]));
  }
  for (int i = 0; i < nstreams; ++i)
  {
    checkCudaErrors(cudaStreamDestroy(streams[i]));
  }
  return 0;
}
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter