diff --git a/TPs/TP2/CODE/Makefile b/TPs/TP2/CODE/Makefile index 087df5f56444cbbc05026aa03a653f3c60a18cac..9e7e8976259658c2a547f97217d4ce248f28cbe0 100755 --- a/TPs/TP2/CODE/Makefile +++ b/TPs/TP2/CODE/Makefile @@ -1,21 +1,31 @@ CC=gcc -CFLAGS=-O3 -Wall +NVCC=nvcc +CFLAGS=-O3 -Wall -Wextra +NVFLAGS=-O3 -I . -Xcompiler="-Wall -Wextra -Werror -O3" LDFLAGS=-lm -EXE=tp2.exe +TARGET=tp2cpu +TARGETGPU=tp2gpu -all : $(EXE) +all: $(TARGET) $(TARGETGPU) -$(EXE) : tp2.o +$(TARGET): tp2_cpu.o $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -%.o : %.c +$(TARGETGPU): tp2_gpu.o + $(NVCC) $(NVFLAGS) -o $@ $< $(LDFLAGS) + +%_cpu.o: %.c $(CC) $(CFLAGS) -c -o $@ $< -run : $(EXE) - ./$< 10240 +%_gpu.o: %.cu + $(NVCC) $(NVFLAGS) -c -o $@ $< + +run: $(TARGET) $(TARGETGPU) + ./$(TARGET) 10240 + ./$(TARGETGPU) 10240 proper : rm -f *.o clean : - rm -f *.o $(EXE) + rm -f *.o $(TARGET) $(TARGETGPU) diff --git a/TPs/TP2/CODE/helper_cuda.h b/TPs/TP2/CODE/helper_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..3dd446dced85de9d10eb9f1597c56099fa358bdb --- /dev/null +++ b/TPs/TP2/CODE/helper_cuda.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +#ifndef MIN +#define MIN(a, b) (a < b ? a : b) +#endif + +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +void check(cudaError_t result, char const *const func, const char *const file, + int const line) +{ + if (result) + { + fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, (int)result, cudaGetErrorName(result), func); + exit(EXIT_FAILURE); + } +} + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) + { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, (int)(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// Beginning of GPU Architecture definitions +int _ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) + { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) + { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." + " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char *_ConvertSMVer2ArchName(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char *name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) + { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) + { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} +// end of GPU Architecture definitions + +// end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ \ No newline at end of file diff --git a/TPs/TP2/CODE/tp2.cu b/TPs/TP2/CODE/tp2.cu index 137cfc16847247bc16b5c8ec7f30d6ebb6adaa07..5396b6dcb91f57c70400024afdd6cbeea2fce868 100755 --- a/TPs/TP2/CODE/tp2.cu +++ b/TPs/TP2/CODE/tp2.cu @@ -1,67 +1,147 @@ #include <stdlib.h> #include <stdio.h> #include <math.h> +#include <time.h> +#include <inttypes.h> + +#include "helper_cuda.h" + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) #define NSTREAMS 4 +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + void verif(float *out, int sz) { - float err = 0.; - - for(int i = 0 ; i < sz ; i++) - { - err += abs(out[i] - exp( - abs(sin(i * 1.0)) )); - } - - if (err/sz < 1.e-4) - { - fprintf(stdout, "TEST PASSED (error %3.f < 1.e-4)\n", err/sz); - } - else - { - fprintf(stderr, "TEST FAILED (error %3.f > 1.e-4)\n", err/sz); - } + float err = 0.; + + for(int i = 0 ; i < sz ; i++) + { + err += abs(out[i] - exp( - abs(sin(i * 1.0)) )); + } + + if (err/sz < 1.e-4) + { + fprintf(stdout, "TEST PASSED (error %3.5f < 1.e-4)\n", err/sz); + } + else + { + fprintf(stderr, "TEST FAILED (error %3.5f > 1.e-4)\n", err/sz); + } } void func(float *out, int size) { - for(int i = 0; i < size; ++i) - { - out[i] = exp( - abs(out[i]) ); - } + for(int i = 0; i < size; ++i) + { + out[i] = exp( - abs(out[i]) ); + } +} + +__global__ void funck(float *out, int size) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= size) + return; + + out[index] = exp( - abs(out[index])); } int main(int argc, char** argv) { - int size = 1024; - if (argc == 2) - { - size = atoi(argv[1]); - } + int size = 1024; + if (argc >= 2) + { + size = atoi(argv[1]); + } + int nstreams = NSTREAMS; + if (argc >= 3) + { + nstreams = atoi(argv[2]); + } + + int subsize = size / nstreams; + + cudaStream_t streams[nstreams]; + for (int i = 0; i < nstreams; ++i) + { + checkCudaErrors(cudaStreamCreate(&streams[i])); + } + + float *tab_h = NULL; + float *tab_d = NULL; + + checkCudaErrors(cudaMallocHost((void **) &tab_h, sizeof(float) * size)); + checkCudaErrors(cudaMalloc((void **) &tab_d, sizeof(float) * size)); + + for(int i = 0; i < size; ++i) + { + tab_h[i] = sin(i * 1.); + } + + //func(tab_h, size); + //verif(tab_h, size); + + int blocksize = 1024; + dim3 dimBlock(blocksize, 1, 1); + dim3 dimGrid((size / blocksize / nstreams) + 1, 1, 1); + + double t0, t1; + cudaEvent_t ce[nstreams * 2]; + for(int i = 0; i < nstreams * 2; ++i) + { + cudaEventCreate(&ce[i]); + } + + t0 = get_elapsedtime(); - size *= NSTREAMS; + for(int i = 0; i < nstreams; ++i) + { + checkCudaErrors(cudaMemcpyAsync( + tab_d + (subsize * i), tab_h + (subsize * i), + sizeof(float) * subsize, + cudaMemcpyHostToDevice, streams[i])); + cudaEventRecord(ce[i * 2]); + funck<<<dimGrid, dimBlock, 0, streams[i]>>>( + tab_d + (subsize * i), subsize); + cudaEventRecord(ce[i * 2 + 1]); + checkCudaErrors(cudaMemcpyAsync( + tab_h + (subsize * i), tab_d + (subsize * i), + sizeof(float) * subsize, + cudaMemcpyDeviceToHost, streams[i])); + } - float *tab = NULL; - tab = (float*) malloc(sizeof(float) * size); + cudaDeviceSynchronize(); + getLastCudaError("funck kernel failed"); - float d_tab; - cudaMalloc( + t1 = get_elapsedtime(); - if(tab == NULL) - { - fprintf(stderr, "Bad allocation\n"); - return -1; - } + fprintf(stdout, "Total Time (%d): %lf ms\n", nstreams, (t1 - t0) * 1000); + float cudaDuration, cudaTotalDuration = 0; + for(int i = 0; i < nstreams; ++i){ + cudaEventElapsedTime(&cudaDuration, ce[i * 2], ce[i * 2 + 1]); + cudaTotalDuration += cudaDuration; + fprintf(stdout, "Time of stream %d: %lf ms\n", i, cudaDuration); + } + fprintf(stdout, "Total Time of streams: %lf ms\n", cudaTotalDuration); - for(int i = 0; i < size; ++i) - { - tab[i] = sin(i * 1.); - } + verif(tab_h, size); - func(tab, size); + checkCudaErrors(cudaFree(tab_d)); + cudaFreeHost(tab_h); - verif(tab, size); + for (int i = 0; i < nstreams; ++i) + { + checkCudaErrors(cudaStreamDestroy(streams[i])); + } - free(tab); - return 0; + return 0; }