diff --git a/TDs/TD2/CORRECTION/Makefile b/TDs/TD2/CORRECTION/Makefile index b51970c246b010e4eff4e06c30f78f7212bc8cc7..2e07ac4f537854305082a4a968a760f15963edce 100755 --- a/TDs/TD2/CORRECTION/Makefile +++ b/TDs/TD2/CORRECTION/Makefile @@ -5,14 +5,13 @@ CFLAGS=-O3 -Wall EXE1=pi.exe EXE2=pi_omp.exe # EXE3=pi_task.exe -#EXE3=pi_cuda.exe -#EXE4=pi_cuda_shared.exe -#EXE5=pi_cuda_shared_2.exe -#EXE6=pi_cuda_shared_3.exe -#EXE7=pi_cuda_shared_4.exe +EXE3=pi_cuda.exe +EXE4=pi_cuda_shared.exe +EXE5=pi_cuda_shared_2.exe +EXE6=pi_cuda_shared_3.exe +EXE7=pi_cuda_shared_4.exe -all : $(EXE1) $(EXE2) -#$(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7) +all : $(EXE1) $(EXE2) $(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7) $(EXE1) : pi_sequentiel.o $(CC) $(CFLAGS) -o $@ $< @@ -20,9 +19,6 @@ $(EXE1) : pi_sequentiel.o $(EXE2) : pi_omp.o $(CC) $(CFLAGS_OMP) -o $@ $< -# $(EXE3) : pi_task.o -# $(CC) $(CFLAGS_OMP) -o $@ $< - $(EXE3) : pi_cuda.cu $(CUDA_CC) -O3 -o $@ $< diff --git a/TDs/TD2/CORRECTION/pi_cuda.cu b/TDs/TD2/CORRECTION/pi_cuda.cu new file mode 100755 index 0000000000000000000000000000000000000000..7762ce1806913235615124be84680dfd6720b2f5 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda.cu @@ -0,0 +1,84 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate, curandState *states) { + unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + + curand_init(2020, tid, 0, &states[tid]); // 
Initialize CURAND + + for(int i = 0; i < TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states[tid]); + y = curand_uniform (&states[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. + } + estimate[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts[BLOCKS * THREADS] = { 0 }; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, BLOCKS * THREADS * sizeof(float)); // allocate device mem. for counts +/* FIN QUESTION 4 */ + +/* QUESTION 6 */ + curandState *d_states; + cudaMalloc( (void **)&d_states, THREADS * BLOCKS * sizeof(curandState) ); +/* FIN QUESTION 6 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts, d_states); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(h_counts, d_counts, BLOCKS * THREADS * sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = 0.f; + for(int i = 0; i < BLOCKS * THREADS; i++) + { + pi_gpu += h_counts[i]; + } + + pi_gpu /= (BLOCKS * THREADS); + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +} diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared.cu b/TDs/TD2/CORRECTION/pi_cuda_shared.cu new file mode 100755 index 0000000000000000000000000000000000000000..53212f22f7af28aa646b7564efef5b26b37ea822 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda_shared.cu @@ -0,0 +1,92 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) 
+/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate) { + unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x; + unsigned int tid = threadIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + __shared__ float estimate_s[THREADS]; + __shared__ curandState states_s[THREADS]; + + curand_init(2020, gtid, 0, &states_s[tid]); // Initialize CURAND + + for(int i = 0; i < TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states_s[tid]); + y = curand_uniform (&states_s[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. + } + estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi + __syncthreads(); + + for (unsigned int s=1; s < THREADS; s*=2) { + if (tid % (2*s) == 0) + estimate_s[tid] += estimate_s[tid + s]; + __syncthreads(); + } + + if (tid == 0) + estimate[blockIdx.x] = estimate_s[0]; +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts[BLOCKS * THREADS] = { 0 }; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)); // allocate device mem. 
for counts +/* FIN QUESTION 4 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = 0.f; + for(int i = 0; i < BLOCKS; i++) + { + pi_gpu += h_counts[i]; + } + + pi_gpu /= BLOCKS * THREADS; + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +} diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu b/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu new file mode 100755 index 0000000000000000000000000000000000000000..6b8a10556e63b09eb366b4e209997d98612ce4a0 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu @@ -0,0 +1,93 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate) { + unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x; + unsigned int tid = threadIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + __shared__ float estimate_s[THREADS]; + __shared__ curandState states_s[THREADS]; + + curand_init(2020, gtid, 0, &states_s[tid]); // Initialize CURAND + + for(int i = 0; i < TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states_s[tid]); + y = curand_uniform (&states_s[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. 
+ } + estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi + __syncthreads(); + + for (unsigned int s=1; s < THREADS; s*=2) { + int index = 2 * s * tid; + if (index < THREADS) + estimate_s[index] += estimate_s[index + s]; + __syncthreads(); + } + + if (tid == 0) + estimate[blockIdx.x] = estimate_s[0]; +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts[BLOCKS * THREADS] = { 0 }; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)); // allocate device mem. for counts +/* FIN QUESTION 4 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = 0.f; + for(int i = 0; i < BLOCKS; i++) + { + pi_gpu += h_counts[i]; + } + + pi_gpu /= BLOCKS * THREADS; + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +} diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu b/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu new file mode 100755 index 0000000000000000000000000000000000000000..8213f5af128395c5730800990fc8e5c9a2a2bf55 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu @@ -0,0 +1,92 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return
(double)st.tv_sec + get_sub_seconde(st); +} + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate) { + unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x; + unsigned int tid = threadIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + __shared__ float estimate_s[THREADS]; + __shared__ curandState states_s[THREADS]; + + curand_init(2020, gtid, 0, &states_s[tid]); // Initialize CURAND + + for(int i = 0; i < TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states_s[tid]); + y = curand_uniform (&states_s[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. + } + estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi + __syncthreads(); + + for (unsigned int s=THREADS/2; s>0; s>>=1){ + if (tid < s) + estimate_s[tid] += estimate_s[tid + s]; + __syncthreads(); + } + + if (tid == 0) + estimate[blockIdx.x] = estimate_s[0]; +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts[BLOCKS * THREADS] = { 0 }; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)); // allocate device mem. 
for counts +/* FIN QUESTION 4 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = 0.f; + for(int i = 0; i < BLOCKS; i++) + { + pi_gpu += h_counts[i]; + } + + pi_gpu /= BLOCKS * THREADS; + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +} diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu b/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu new file mode 100755 index 0000000000000000000000000000000000000000..3a92945d90f641cd3d890a794fa0057bc69c1594 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu @@ -0,0 +1,86 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate) { + unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x; + unsigned int tid = threadIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + __shared__ float estimate_s[THREADS]; + __shared__ curandState states_s[THREADS]; + + curand_init(2020, gtid, 0, &states_s[tid]); // Initialize CURAND + + for(int i = 0; i < TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states_s[tid]); + y = curand_uniform (&states_s[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. 
+ } + estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi + __syncthreads(); + + for (unsigned int s=THREADS/2; s > 0; s>>=1) { + if (tid < s) + estimate_s[tid] += estimate_s[tid + s]; + __syncthreads(); + } + + if (tid == 0) + atomicAdd(estimate, estimate_s[0]); +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts = 0; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, sizeof(float)); cudaMemset(d_counts, 0, sizeof(float)); // allocate device mem. for counts; zero it since atomicAdd accumulates into it +/* FIN QUESTION 4 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(&h_counts, d_counts, sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = h_counts / (BLOCKS * THREADS); + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +}