Reviewed patch for the TP3 OpenMP offloading exercises (AXPY, DGEMM, ONGPU).

Notes:
- The '+' side of axpy.c, dgemm.c and ongpu.c was amended during review, so the
  post-image blob ids on their "index" lines no longer match the patched content.
- The leading whitespace of the original patch was not recoverable; tabs are
  assumed on the '-' side and four spaces on the '+' side.

diff --git a/TPs/TP3/CODE/AXPY/axpy.c b/TPs/TP3/CODE/AXPY/axpy.c
index 2356954914ad85c3f86039c34f7c534982b903d4..bea8defa68ac92f2195be8b4b13868f0701c321f 100755
--- a/TPs/TP3/CODE/AXPY/axpy.c
+++ b/TPs/TP3/CODE/AXPY/axpy.c
@@ -4,28 +4,44 @@
 
 int main(int argc, char** argv)
 {
-	double alpha = 2;
-	double *X = NULL, *Y = NULL;
-	int N = 1000;
-	if (argc > 1) N = atoi(argv[1]);
+    double alpha = 2;
+    double *X = NULL, *Y = NULL;
+    int N = 100000000;
+    if (argc > 1) N = atoi(argv[1]);
+    if (N < 0)
+    {
+        fprintf(stderr, "N must be non-negative (got %d)\n", N);
+        return 1;
+    }
 
-	X = (double*) malloc(sizeof(double) * N);
-	Y = (double*) malloc(sizeof(double) * N);
+    X = (double*) malloc(sizeof(double) * N);
+    Y = (double*) malloc(sizeof(double) * N);
+    /* malloc(0) may legitimately return NULL, so NULL is only a failure when N > 0. */
+    if (N > 0 && (X == NULL || Y == NULL))
+    {
+        fprintf(stderr, "Unable to allocate two vectors of %d doubles\n", N);
+        free(X);
+        free(Y);
+        return 1;
+    }
 
-	for(int i = 0; i < N; ++i)
-	{
-		X[i] = i;
-		Y[i] = X[i] + i;
-	}
+    for(int i = 0; i < N; ++i)
+    {
+        X[i] = i;
+        Y[i] = X[i] + i;
+    }
 
-	for(int i = 0; i < N; ++i)
-	{
-		Y[i] += alpha * X[i];
-	}
+    /* Offload Y = alpha * X + Y: X is only read on the device, Y is copied both ways. */
+    #pragma omp target teams distribute parallel for map(to: X[:N]) map(tofrom: Y[:N])
+    for(int i = 0; i < N; ++i)
+    {
+        Y[i] += alpha * X[i];
+    }
 
-	int stop = (N > 5)?5:N;
-	for(int i = 0; i < stop; ++i)
-		fprintf(stdout, "Y[%d] = %f\n", i, Y[i]);
+    int stop = (N > 5)?5:N;
+    for(int i = 0; i < stop; ++i)
+        fprintf(stdout, "Y[%d] = %f\n", i, Y[i]);
 
-	return 0;
+    free(X);
+    free(Y);
+    return 0;
 }
diff --git a/TPs/TP3/CODE/DGEMM/dgemm.c b/TPs/TP3/CODE/DGEMM/dgemm.c
index 817910f167789628a8d9885794508f5b6b8d2176..b663d00ab83ab5159d492533fb650e7051035526 100755
--- a/TPs/TP3/CODE/DGEMM/dgemm.c
+++ b/TPs/TP3/CODE/DGEMM/dgemm.c
@@ -10,72 +10,115 @@ typedef struct timespec struct_time;
  */
 double get_elapsedtime(void)
 {
-	struct_time st;
-	int err = gettime(&st);
-	if (err !=0) return 0;
-	return (double)st.tv_sec + get_sub_seconde(st);
+    struct_time st;
+    int err = gettime(&st);
+    if (err !=0) return 0;
+    return (double)st.tv_sec + get_sub_seconde(st);
 }
 
 void mult(int N, float *A, float* B, float* C)
 {
-	int i = 0, j = 0, k = 0;
-	int total_size = N*N;
-	for(i = 0; i < N; ++i)
-	{
-		for(j = 0; j < N; ++j)
-		{
-			float val = 0.;
-			for(k = 0; k < N; ++k)
-			{
-				val += A[i * N + k] * B[k * N + j];
-			}
-			C[i * N + j] = val;
-		}
-	}
+    /* Nothing to compute (and no valid array section to map) for an empty or negative size. */
+    if (N <= 0) return;
+
+    /* Element count of each N x N matrix; size_t so that N*N cannot overflow int. */
+    const size_t total_size = (size_t)N * (size_t)N;
+
+    /*
+     * Offloaded C += A * B on row-major N x N matrices.
+     * - A, B and C are pointers, so explicit array sections are needed: mapping a
+     *   bare pointer does not transfer the data it points to.
+     * - C is read before being written, hence tofrom rather than from.
+     * - The loop indices are declared in the for statements so each thread gets
+     *   its own copy; collapse(2) spreads the whole (i, j) space over the device.
+     */
+    #pragma omp target teams distribute parallel for collapse(2) \
+        map(to: A[0:total_size], B[0:total_size]) map(tofrom: C[0:total_size])
+    for(int i = 0; i < N; ++i)
+    {
+        for(int j = 0; j < N; ++j)
+        {
+            /* Accumulate locally and store once; same summation order as updating C in place. */
+            float val = C[(size_t)i * N + j];
+            for(int k = 0; k < N; ++k)
+            {
+                val += A[(size_t)i * N + k] * B[(size_t)k * N + j];
+            }
+            C[(size_t)i * N + j] = val;
+        }
+    }
+}
+
+/*
+ * Deliberately empty function compiled without optimization. main() passes C to
+ * it after the timed region, presumably so that the compiler cannot treat the
+ * result of mult() as unused.
+ */
+[[clang::optnone]]
+void donotopti(float *C){
+    return;
 }
 
 int main(int argc, char** argv)
 {
-	double t0 = 0., t1 = 0., duration = 0.;
+    double t0 = 0., t1 = 0., duration = 0.;
 
-	float *A = NULL;
-	float *B = NULL;
-	float *C = NULL;
+    float *A = NULL;
+    float *B = NULL;
+    float *C = NULL;
 
-	int i = 0;
+    int i = 0;
 
-	int N = 1024;
-	if(argc > 1)
-	{
-		N = atoi(argv[1]);
-	}
+    int N = 1024;
+    if(argc > 1)
+    {
+        N = atoi(argv[1]);
+    }
+    if(N < 0)
+    {
+        fprintf(stderr, "Matrix size must be non-negative (got %d)\n", N);
+        return 1;
+    }
 
-	fprintf(stdout, "> Matrix Multiplication Kernel...\n");
-	fprintf(stdout, " Size: %dx%d\n", N, N);
+    fprintf(stdout, "> Matrix Multiplication Kernel...\n");
+    fprintf(stdout, " Size: %dx%d\n", N, N);
 #pragma omp parallel default(none) shared(stdout)
-	{
+    {
 #pragma omp single
-	fprintf(stdout, " Running on %d threads\n", omp_get_num_threads());
-	}
+    fprintf(stdout, " Running on %d threads\n", omp_get_num_threads());
+    }
+
+    A = (float*) calloc((size_t)N * N, sizeof(float));
+    B = (float*) calloc((size_t)N * N, sizeof(float));
+    C = (float*) calloc((size_t)N * N, sizeof(float));
+    /* calloc(0, ...) may legitimately return NULL, so NULL is only a failure when N > 0. */
+    if(N > 0 && (A == NULL || B == NULL || C == NULL))
+    {
+        fprintf(stderr, "Unable to allocate three %dx%d matrices\n", N, N);
+        free(A);
+        free(B);
+        free(C);
+        return 1;
+    }
 
-	A = (float*) calloc(N * N, sizeof(float));
-	B = (float*) calloc(N * N, sizeof(float));
-	C = (float*) calloc(N * N, sizeof(float));
+    for(i = 0; i < N * N; ++i)
+    {
+        A[i] = 1. * i;
+        B[i] = N*N - (1. * i);
+        C[i] = 0.;
+    }
 
-	for(i = 0; i < N * N; ++i)
-	{
-		A[i] = 1. * i;
-		B[i] = N*N - (1. * i);
-		C[i] = 0.;
-	}
+    t0 = get_elapsedtime();
+    mult(N, A, B, C);
+    t1 = get_elapsedtime();
 
-	t0 = get_elapsedtime();
-	mult(N, A, B, C);
-	t1 = get_elapsedtime();
+    duration = (t1 - t0);
 
-	duration = (t1 - t0);
+    fprintf(stdout, " Elapsed Time : %f\n", duration);
 
-	fprintf(stdout, " Elapsed Time : %f\n", duration);
+    donotopti(C);
 
-	return 0;
+    free(A);
+    free(B);
+    free(C);
+    return 0;
 }
diff --git a/TPs/TP3/CODE/ONGPU/Makefile b/TPs/TP3/CODE/ONGPU/Makefile
index 76dd3a13d868d01fc5218a2ea6c702ab4c921ac3..d57f90448d1c99ed85dd1f1646ecd841a5cb646e 100755
--- a/TPs/TP3/CODE/ONGPU/Makefile
+++ b/TPs/TP3/CODE/ONGPU/Makefile
@@ -1,4 +1,4 @@
-CC=clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_61
+CC=clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
 CFLAGS=-O3 -Wall
 LDFLAGS=-lm -lrt
 
diff --git a/TPs/TP3/CODE/ONGPU/ongpu.c b/TPs/TP3/CODE/ONGPU/ongpu.c
index 901ab15978b8401e87db2677f19c4a8b5a2df1e7..d10f8f9c17b099037ea4a981c79c2bfd85c211d9 100755
--- a/TPs/TP3/CODE/ONGPU/ongpu.c
+++ b/TPs/TP3/CODE/ONGPU/ongpu.c
@@ -17,5 +17,16 @@ int main()
     else
         printf("### Unable to use the GPU, using CPU! ###\n");
 
+    printf("Nb. devices: %d\n", omp_get_num_devices());
+
+    int team = 0;
+    /* Only team 0 stores the count, so concurrent teams never write `team` at the same time. */
+#pragma omp target teams map(from: team)
+    {
+        if (omp_get_team_num() == 0)
+            team = omp_get_num_teams();
+    }
+
+    printf("Nb. teams: %d\n", team);
     return 0;
 }