diff --git a/TPs/TP1/CODE/Partie1/tp1_check.cu b/TPs/TP1/CODE/Partie1/tp1_check.cu
new file mode 100755
index 0000000000000000000000000000000000000000..2b55dc78199539148db061e497ede0086d50bc96
--- /dev/null
+++ b/TPs/TP1/CODE/Partie1/tp1_check.cu
@@ -0,0 +1,110 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Abort with a readable message when a CUDA runtime call fails. */
+#define CUDA_CHECK(call) \
+  do { \
+    cudaError_t err_ = (call); \
+    if (err_ != cudaSuccess) { \
+      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+              cudaGetErrorString(err_)); \
+      exit(EXIT_FAILURE); \
+    } \
+  } while (0)
+
+/* Smoke-test kernel: stores 1 in the int pointed to by v. */
+__global__ void kernel(int *v)
+{
+  *v = 1;
+}
+
+/* Prints the properties of every CUDA device, then sends one int through
+ * a single-thread kernel and reports whether it came back as 1. */
+int main(int argc, char **argv)
+{
+  int sz_in_bytes = sizeof(int);
+
+  int *h_a;
+  int *d_a;
+
+  // Initialised so the loop below is well defined in every case
+  int nDevices = 0;
+
+  // Querying the CUDA device properties
+  CUDA_CHECK(cudaGetDeviceCount(&nDevices));
+  for (int i = 0; i < nDevices; i++) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
+    printf("Device Number: %d\n", i);
+    printf(" Device name: %s\n", prop.name);
+    printf(" Memory Clock Rate (KHz): %d\n",
+        prop.memoryClockRate);
+    printf(" Memory Bus Width (bits): %d\n",
+        prop.memoryBusWidth);
+    printf(" Peak Memory Bandwidth (GB/s): %f\n",
+        2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
+    printf(" Device Compute Capability: %d.%d\n",
+        prop.major, prop.minor);
+    printf(" > Kernel Configuration information\n");
+    printf(" - Warp Size: %d\n",
+        prop.warpSize);
+    printf(" - Max Threads Per Block: %d\n",
+        prop.maxThreadsPerBlock);
+    printf(" - Max size of each dimension of a Block: (%d, %d, %d)\n",
+        prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
+    printf(" - Max size of each dimension of a Grid: (%d, %d, %d)\n",
+        prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
+    printf(" > Memory information\n");
+    // The three sizes below are size_t, hence %zu
+    printf(" - Total Global Memory size (bytes): %zu\n",
+        prop.totalGlobalMem);
+    printf(" - Max Shared Memory size per block (bytes): %zu\n",
+        prop.sharedMemPerBlock);
+    printf(" - Max Constant Memory size (bytes): %zu\n\n",
+        prop.totalConstMem);
+  }
+
+  // Allocation on host (malloc)
+  h_a = (int*)malloc(sz_in_bytes);
+  if (h_a == NULL)
+  {
+    fprintf(stderr, "Bad allocation\n");
+    return EXIT_FAILURE;
+  }
+  *h_a = 0;
+
+  // Allocation on device (cudaMalloc)
+  CUDA_CHECK(cudaMalloc((void**)&d_a, sz_in_bytes));
+
+  // Copying data to device (cudaMemcpy)
+  CUDA_CHECK(cudaMemcpy(d_a, h_a, sz_in_bytes, cudaMemcpyHostToDevice));
+
+  // Kernel configuration
+  dim3 dimBlock(1, 1, 1);
+  dim3 dimGrid(1, 1, 1);
+
+  // Kernel launch
+  kernel<<<dimGrid , dimBlock>>>(d_a);
+  // A launch returns nothing, so fetch launch errors explicitly
+  CUDA_CHECK(cudaGetLastError());
+
+  // Retrieving data from device (cudaMemcpy)
+  CUDA_CHECK(cudaMemcpy(h_a, d_a, sz_in_bytes, cudaMemcpyDeviceToHost));
+
+  // Freeing on device (cudaFree)
+  CUDA_CHECK(cudaFree(d_a));
+
+  // Verifying that the kernel wrote 1 into the value copied back
+  if(*h_a == 1)
+  {
+    fprintf(stderr, "TEST PASSED !\n");
+  }
+  else
+  {
+    fprintf(stderr, "TEST FAILED !\n");
+  }
+
+  free(h_a);
+
+  return 0;
+}
diff --git a/TPs/TP1/CODE/Partie2/Makefile b/TPs/TP1/CODE/Partie2/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..4ae22712a5d3de26583073d9da7dd646cc80b99c
--- /dev/null
+++ b/TPs/TP1/CODE/Partie2/Makefile
@@ -0,0 +1,26 @@
+CC=gcc
+CFLAGS=-O3 -Wall
+LDFLAGS=-lm
+EXE=mult.exe
+
+.PHONY : all run proper clean
+
+all : $(EXE)
+
+# dgemm.c calls fabs(), so libm is linked explicitly
+$(EXE) : dgemm.o
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+%.o : %.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+run : $(EXE)
+	./$< 1000
+
+# proper removes the object files only
+proper :
+	rm -f *.o
+
+# clean removes the object files and the executable
+clean :
+	rm -f *.o $(EXE)
diff --git a/TPs/TP1/CODE/Partie2/dgemm.c b/TPs/TP1/CODE/Partie2/dgemm.c
new file mode 100755
index 0000000000000000000000000000000000000000..90f051ebb7862ce96116ed4488b3836a2efaf480
--- /dev/null
+++ b/TPs/TP1/CODE/Partie2/dgemm.c
@@ -0,0 +1,133 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <inttypes.h>
+#include <math.h>
+
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)(t).tv_nsec)
+/** Returns the CLOCK_MONOTONIC_RAW time in seconds,
+ *  or 0 when the clock cannot be read.
+*/
+double get_elapsedtime(void)
+{
+  struct timespec st;
+  int err = gettime(&st);
+  if (err !=0) return 0;
+  return (double)st.tv_sec + get_sub_seconde(st);
+}
+
+/** Compares two N x N matrices element by element.
+ *  Prints the first pair that differs by more than 0.01 and returns 1;
+ *  returns 0 when all elements agree (or when N <= 0).
+*/
+int verify_matrix(double *matRef, double *matOut, int N) {
+  double diff = 0.0;
+  uint64_t i;
+  uint64_t size;
+
+  if (N <= 0) return 0;
+
+  // Widen before multiplying: N*N overflows int for N > 46340
+  size = (uint64_t)N * (uint64_t)N;
+  for (i = 0; i < size; i++) {
+    diff = fabs(matRef[i] - matOut[i]);
+    if (diff > 0.01) {
+      // i is a uint64_t, so it is printed with PRIu64
+      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %" PRIu64 "\n",
+          matRef[i], matOut[i], diff, i);
+      return 1;
+    }
+  }
+  return 0;
+}
+
+
+/** Fills A and B with pseudo-random values (fixed seed 2019) and zeroes C.
+ *  All three are indexed as size x size arrays, element (i, j) at i * size + j.
+*/
+void init(double* A, double* B, double* C, int size)
+{
+  int i = 0, j = 0;
+
+  srand(2019);
+
+  for(i = 0; i < size; i++)
+  {
+    for(j = 0; j < size; j++)
+    {
+      // size_t index: i * size + j overflows int for large matrices
+      size_t idx = (size_t)i * size + j;
+      A[idx] = (double) (rand() % 10) + 0.01 * (rand() % 5);
+      B[idx] = (double) (rand() % 10) + 0.01 * (rand() % 5);
+      C[idx] = 0.0;
+    }
+  }
+}
+
+/** Computes C = A * B for size x size matrices, element (i, j) at i * size + j. */
+void mult(double* A, double* B, double* C, int size)
+{
+  int i = 0, j = 0, k = 0;
+  size_t n = (size_t)size;
+
+  for(i = 0; i < size; i++)
+  {
+    for(j = 0; j < size; j++)
+    {
+      double sum = 0.;
+      for(k = 0; k < size; k++)
+      {
+        sum += A[i * n + k] * B[k * n + j];
+      }
+      C[i * n + j] = sum;
+    }
+  }
+}
+
+/** Times one N x N matrix product; N comes from argv[1] (default 1000). */
+int main(int argc, char** argv){
+  int N = 0;
+
+  double *A = NULL;
+  double *B = NULL;
+  double *C = NULL;
+
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  N = (argc < 2)?1000:atoi(argv[1]);
+  if (N <= 0)
+  {
+    fprintf(stderr, "Invalid matrix size\n");
+    return EXIT_FAILURE;
+  }
+  fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N);
+
+  // Memory allocation
+  A = (double*) malloc(sizeof(double) * N * N);
+  B = (double*) malloc(sizeof(double) * N * N);
+  C = (double*) malloc(sizeof(double) * N * N);
+  if (A == NULL || B == NULL || C == NULL)
+  {
+    fprintf(stderr, "Bad allocation\n");
+    return EXIT_FAILURE;
+  }
+
+  // Value initialization
+  init(A, B, C, N);
+
+  // Compute multiplication
+  t0 = get_elapsedtime();
+  mult(A, B, C, N);
+  t1 = get_elapsedtime();
+
+  // Pretty print
+  duration = (t1 - t0);
+  uint64_t N_u64 = (uint64_t) N;
+  uint64_t nb_op = N_u64 * N_u64 * N_u64;
+  fprintf(stdout, "Performance results: \n");
+  fprintf(stdout, " Time: %lf s\n", duration);
+  fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6);
+
+  return 0;
+}
diff --git a/TPs/TP1/DOCS/CUDA_C_Best_Practices_Guide.pdf b/TPs/TP1/DOCS/CUDA_C_Best_Practices_Guide.pdf
new file mode 100755
index 0000000000000000000000000000000000000000..d32a346a4a6814537a7cc1601940479b7b766b1c
Binary files /dev/null and b/TPs/TP1/DOCS/CUDA_C_Best_Practices_Guide.pdf differ
diff --git a/TPs/TP1/DOCS/CUDA_C_Programming_Guide.pdf b/TPs/TP1/DOCS/CUDA_C_Programming_Guide.pdf
new file mode 100755
index 0000000000000000000000000000000000000000..4b66369eefbc60fcabb586843cf67f3d44d98c6a
Binary files /dev/null and b/TPs/TP1/DOCS/CUDA_C_Programming_Guide.pdf differ
diff --git a/TPs/TP1/SUJET/tp1.pdf b/TPs/TP1/SUJET/tp1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..68e14831f27b7a20e1351e4346c717d606af12ee
Binary files /dev/null and b/TPs/TP1/SUJET/tp1.pdf differ
diff --git a/TPs/TP2/CODE/Makefile b/TPs/TP2/CODE/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..087df5f56444cbbc05026aa03a653f3c60a18cac
--- /dev/null
+++ b/TPs/TP2/CODE/Makefile
@@ -0,0 +1,23 @@
+CC=gcc
+CFLAGS=-O3 -Wall
+LDFLAGS=-lm
+EXE=tp2.exe
+
+.PHONY : all run proper clean
+
+all : $(EXE)
+
+$(EXE) : tp2.o
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+%.o : %.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+run : $(EXE)
+	./$< 10240
+
+proper :
+	rm -f *.o
+
+clean :
+	rm -f *.o $(EXE)
diff --git a/TPs/TP2/CODE/tp2.c b/TPs/TP2/CODE/tp2.c
new file mode 100755
index 0000000000000000000000000000000000000000..52054e51c1b1364f90d2335a6d2e58276c663ab5
--- /dev/null
+++ b/TPs/TP2/CODE/tp2.c
@@ -0,0 +1,78 @@
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#define NSTREAMS 4
+
+/* Compares out[i] with exp(-|sin(i)|) for i in [0, sz) and prints
+ * whether the mean absolute error is below 1.e-4. */
+void verif(float *out, int sz)
+{
+  float err = 0.;
+
+  for(int i = 0 ; i < sz ; i++)
+  {
+    // fabs, not abs: in C abs() takes an int and would truncate to 0
+    err += fabs(out[i] - exp( - fabs(sin(i * 1.0)) ));
+  }
+
+  if (err/sz < 1.e-4)
+  {
+    fprintf(stdout, "TEST PASSED (error %.3e < 1.e-4)\n", err/sz);
+  }
+  else
+  {
+    fprintf(stderr, "TEST FAILED (error %.3e > 1.e-4)\n", err/sz);
+  }
+}
+
+/* Replaces every out[i] with exp(-|out[i]|). */
+void func(float *out, int size)
+{
+  for(int i = 0; i < size; ++i)
+  {
+    out[i] = exp( - fabs(out[i]) );
+  }
+}
+
+/* Fills an array with sin(i), applies func to it and verifies the result.
+ * The element count is argv[1] (default 1024) times NSTREAMS. */
+int main(int argc, char** argv)
+{
+  int size = 1024;
+  if (argc == 2)
+  {
+    size = atoi(argv[1]);
+  }
+
+  // Reject sizes that are not positive or that overflow once scaled
+  if (size <= 0 || size > INT_MAX / NSTREAMS)
+  {
+    fprintf(stderr, "Invalid size\n");
+    return -1;
+  }
+
+  size *= NSTREAMS;
+
+  float *tab = NULL;
+  tab = (float*) malloc(sizeof(float) * size);
+
+  if(tab == NULL)
+  {
+    fprintf(stderr, "Bad allocation\n");
+    return -1;
+  }
+
+  for(int i = 0; i < size; ++i)
+  {
+    tab[i] = sin(i * 1.);
+  }
+
+  func(tab, size);
+
+  verif(tab, size);
+
+  free(tab);
+  return 0;
+}
diff --git a/TPs/TP2/CODE/tp2.cu b/TPs/TP2/CODE/tp2.cu
new file mode 100755
index 0000000000000000000000000000000000000000..137cfc16847247bc16b5c8ec7f30d6ebb6adaa07
--- /dev/null
+++ b/TPs/TP2/CODE/tp2.cu
@@ -0,0 +1,89 @@
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#define NSTREAMS 4
+
+/* Compares out[i] with exp(-|sin(i)|) for i in [0, sz) and prints
+ * whether the mean absolute error is below 1.e-4. */
+void verif(float *out, int sz)
+{
+  float err = 0.;
+
+  for(int i = 0 ; i < sz ; i++)
+  {
+    // fabs is used so the value is never truncated by the integer abs()
+    err += fabs(out[i] - exp( - fabs(sin(i * 1.0)) ));
+  }
+
+  if (err/sz < 1.e-4)
+  {
+    fprintf(stdout, "TEST PASSED (error %.3e < 1.e-4)\n", err/sz);
+  }
+  else
+  {
+    fprintf(stderr, "TEST FAILED (error %.3e > 1.e-4)\n", err/sz);
+  }
+}
+
+/* Replaces every out[i] with exp(-|out[i]|); runs on the host. */
+void func(float *out, int size)
+{
+  for(int i = 0; i < size; ++i)
+  {
+    out[i] = exp( - fabs(out[i]) );
+  }
+}
+
+/* Fills an array with sin(i), applies func to it and verifies the result.
+ * The element count is argv[1] (default 1024) times NSTREAMS. */
+int main(int argc, char** argv)
+{
+  int size = 1024;
+  if (argc == 2)
+  {
+    size = atoi(argv[1]);
+  }
+
+  // Reject sizes that are not positive or that overflow once scaled
+  if (size <= 0 || size > INT_MAX / NSTREAMS)
+  {
+    fprintf(stderr, "Invalid size\n");
+    return -1;
+  }
+
+  size *= NSTREAMS;
+
+  float *tab = NULL;
+  tab = (float*) malloc(sizeof(float) * size);
+
+  if(tab == NULL)
+  {
+    fprintf(stderr, "Bad allocation\n");
+    return -1;
+  }
+
+  // TODO: d_tab is allocated and released but not used yet; func() runs on the host
+  float *d_tab = NULL;
+  cudaError_t err = cudaMalloc((void**)&d_tab, sizeof(float) * size);
+  if(err != cudaSuccess)
+  {
+    fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
+    free(tab);
+    return -1;
+  }
+
+  for(int i = 0; i < size; ++i)
+  {
+    tab[i] = sin(i * 1.);
+  }
+
+  func(tab, size);
+
+  verif(tab, size);
+
+  cudaFree(d_tab);
+  free(tab);
+  return 0;
+}
diff --git a/TPs/TP2/DOCS/StreamsAndConcurrencyWebinar.pdf b/TPs/TP2/DOCS/StreamsAndConcurrencyWebinar.pdf
new file mode 100755
index 0000000000000000000000000000000000000000..fa2d3b5f6e36b0f6a8ca43db3bc754d7f8de9c0b
Binary files /dev/null and b/TPs/TP2/DOCS/StreamsAndConcurrencyWebinar.pdf differ
diff --git a/TPs/TP2/SUJET/tp2.pdf b/TPs/TP2/SUJET/tp2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..5a3f0a389860d388f9b37b3cedb1793103510415
Binary files /dev/null and b/TPs/TP2/SUJET/tp2.pdf differ