diff --git a/TPs/TP4/CODE/Partie2/helper_cuda.h b/TPs/TP4/CODE/Partie2/helper_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..3dd446dced85de9d10eb9f1597c56099fa358bdb --- /dev/null +++ b/TPs/TP4/CODE/Partie2/helper_cuda.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef MAX
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+// Abort with a diagnostic if a CUDA runtime call returned an error.
+// inline: this is a header-level definition; non-inline would violate the ODR
+inline void check(cudaError_t result, char const *const func, const char *const file,
+                  int const line)
+{
+    if (result)
+    {
+        fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, (int)result, cudaGetErrorName(result), func);
+        exit(EXIT_FAILURE);
+    }
+}
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line)
+{
+    cudaError_t err = cudaGetLastError();
+
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr,
+                "%s(%i) : getLastCudaError() CUDA error :"
+                " %s : (%d) %s.\n",
+                file, line, errorMessage, (int)(err),
+                cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine
+    // the # of cores per SM)
+    typedef struct
+    {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version,
+                // and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    sSMtoCores nGpuArchCoresPerSM[] = {
+        {0x30, 192},
+        {0x32, 192},
+        {0x35, 192},
+        {0x37, 192},
+        {0x50, 128},
+        {0x52, 128},
+        {0x53, 128},
+        {0x60, 64},
+        {0x61, 128},
+        {0x62, 128},
+        {0x70, 64},
+        {0x72, 64},
+        {0x75, 64},
+        {0x80, 64},
+        {0x86, 128},
+        {0x87, 128},
+        {-1, -1}};
+
+    int index = 0;
+
+    while (nGpuArchCoresPerSM[index].SM != -1)
+    {
+        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+        {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+
+        index++;
+    }
+
+    // If we don't find the values, we default use the previous one
+    // to run properly
+    printf(
+        "MapSMtoCores for SM %d.%d is undefined."
+        " Default to use %d Cores/SM\n",
+        major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+    return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char *_ConvertSMVer2ArchName(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine
+    // the GPU Arch name)
+    typedef struct
+    {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version,
+                // and m = SM minor version
+        const char *name;
+    } sSMtoArchName;
+
+    sSMtoArchName nGpuArchNameSM[] = {
+        {0x30, "Kepler"},
+        {0x32, "Kepler"},
+        {0x35, "Kepler"},
+        {0x37, "Kepler"},
+        {0x50, "Maxwell"},
+        {0x52, "Maxwell"},
+        {0x53, "Maxwell"},
+        {0x60, "Pascal"},
+        {0x61, "Pascal"},
+        {0x62, "Pascal"},
+        {0x70, "Volta"},
+        {0x72, "Xavier"},
+        {0x75, "Turing"},
+        {0x80, "Ampere"},
+        {0x86, "Ampere"},
+        {-1, "Graphics Device"}};
+
+    int index = 0;
+
+    while (nGpuArchNameSM[index].SM != -1)
+    {
+        if (nGpuArchNameSM[index].SM == ((major << 4) + minor))
+        {
+            return nGpuArchNameSM[index].name;
+        }
+
+        index++;
+    }
+
+    // If we don't find the values, we default use the previous one
+    // to run properly
+    printf(
+        "MapSMtoArchName for SM %d.%d is undefined."
+        " Default to use %s\n",
+        major, minor, nGpuArchNameSM[index - 1].name);
+    return nGpuArchNameSM[index - 1].name;
+}
+// end of GPU Architecture definitions
+
+// end of CUDA Helper Functions
+
+#endif // COMMON_HELPER_CUDA_H_
diff --git a/TPs/TP4/CODE/Partie2/unified_mem.cu b/TPs/TP4/CODE/Partie2/unified_mem.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a44e601e131dd645892f8c8807126e9a1f5f7b5a
--- /dev/null
+++ b/TPs/TP4/CODE/Partie2/unified_mem.cu
@@ -0,0 +1,123 @@
+#include <stdio.h>
+#include <time.h>
+
+#include "helper_cuda.h"
+
+// Wall-clock time in seconds from a raw monotonic clock (POSIX clock_gettime).
+double get_time(void)
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+    return (double) ts.tv_sec + (1e-9 * (double) ts.tv_nsec);
+}
+
+// Element-wise product C[i] = A[i] * B[i]; one thread per element,
+// with a bounds guard for the partial last block.
+__global__ void scal_mul(float *A, float *B, float *C, size_t size){
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= size)
+        return;
+    C[i] = A[i] * B[i];
+}
+
+int main(){
+
+    size_t size = 102300000;
+
+    size_t blockSize = 1024;
+    dim3 dimBlock(blockSize, 1, 1);
+    dim3 dimGrid(size / blockSize + 1, 1, 1);
+
+    // Warm-up launch (size 0: every thread exits on the bounds guard) so that
+    // CUDA context creation is not charged to the first timed section; the
+    // synchronize keeps the asynchronous launch out of the timing window.
+    scal_mul<<<dimGrid, dimBlock>>>(NULL, NULL, NULL, 0);
+    getLastCudaError("warm-up kernel launch failed");
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    double t1;
+    double t2;
+
+    t1 = get_time();
+
+    int sum1;
+
+    float *h_A = NULL;
+    float *h_B = NULL;
+    float *h_C = NULL;
+    float *d_A = NULL;
+    float *d_B = NULL;
+    float *d_C = NULL;
+
+    // Explicit-copy path: pinned host buffers + separate device buffers.
+    checkCudaErrors(cudaMallocHost(&h_A, sizeof(float) * size));
+    checkCudaErrors(cudaMallocHost(&h_B, sizeof(float) * size));
+    checkCudaErrors(cudaMallocHost(&h_C, sizeof(float) * size));
+    checkCudaErrors(cudaMalloc(&d_A, sizeof(float) * size));
+    checkCudaErrors(cudaMalloc(&d_B, sizeof(float) * size));
+    checkCudaErrors(cudaMalloc(&d_C, sizeof(float) * size));
+
+    for(size_t i = 0; i < size; ++i) {
+        h_A[i] = 1;
+        h_B[i] = 1;
+    }
+
+    checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(float) * size, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_B, h_B, sizeof(float) * size, cudaMemcpyHostToDevice));
+
+    scal_mul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, size);
+    getLastCudaError("scal_mul launch failed (explicit-copy path)");
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    checkCudaErrors(cudaMemcpy(h_C, d_C, sizeof(float) * size, cudaMemcpyDeviceToHost));
+
+    // Every product is 1.0f, so the int accumulator effectively counts elements.
+    sum1 = 0;
+    for(size_t i = 0; i < size; ++i) {
+        sum1 += h_C[i];
+    }
+
+    checkCudaErrors(cudaFree(d_A));
+    checkCudaErrors(cudaFree(d_B));
+    checkCudaErrors(cudaFree(d_C));
+    checkCudaErrors(cudaFreeHost(h_A));
+    checkCudaErrors(cudaFreeHost(h_B));
+    checkCudaErrors(cudaFreeHost(h_C));
+
+    t2 = get_time();
+    printf("Normal CUDA: %d in %g[s]\n", sum1, t2 - t1);
+
+    // Unified-memory path: same computation through cudaMallocManaged buffers.
+    t1 = get_time();
+
+    int sum2;
+
+    float *u_A = NULL;
+    float *u_B = NULL;
+    float *u_C = NULL;
+    checkCudaErrors(cudaMallocManaged(&u_A, sizeof(float) * size));
+    checkCudaErrors(cudaMallocManaged(&u_B, sizeof(float) * size));
+    checkCudaErrors(cudaMallocManaged(&u_C, sizeof(float) * size));
+
+    for(size_t i = 0; i < size; ++i) {
+        u_A[i] = 1;
+        u_B[i] = 1;
+    }
+
+    scal_mul<<<dimGrid, dimBlock>>>(u_A, u_B, u_C, size);
+    getLastCudaError("scal_mul launch failed (unified-memory path)");
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    sum2 = 0;
+    for(size_t i = 0; i < size; ++i) {
+        sum2 += u_C[i];
+    }
+
+    checkCudaErrors(cudaFree(u_A));
+    checkCudaErrors(cudaFree(u_B));
+    checkCudaErrors(cudaFree(u_C));
+
+    t2 = get_time();
+    printf("Unified Mem CUDA: %d in %g[s]\n", sum2, t2 - t1);
+
+    return 0;
+}