Skip to content
Extraits de code Groupes Projets
Valider bcde70ef rédigé par Nicolas MARIE's avatar Nicolas MARIE
Parcourir les fichiers

TP2

parent 08448aef
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
# Build the CPU (gcc) and GPU (nvcc) versions of TP2.
CC=gcc
NVCC=nvcc
CFLAGS=-O3 -Wall -Wextra
NVFLAGS=-O3 -I . -Xcompiler="-Wall -Wextra -Werror -O3"
LDFLAGS=-lm
TARGET=tp2cpu
TARGETGPU=tp2gpu

.PHONY: all run proper clean

all: $(TARGET) $(TARGETGPU)

$(TARGET): tp2_cpu.o
	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)

$(TARGETGPU): tp2_gpu.o
	$(NVCC) $(NVFLAGS) -o $@ $< $(LDFLAGS)

%_cpu.o: %.c
	$(CC) $(CFLAGS) -c -o $@ $<

%_gpu.o: %.cu
	$(NVCC) $(NVFLAGS) -c -o $@ $<

run: $(TARGET) $(TARGETGPU)
	./$(TARGET) 10240
	./$(TARGETGPU) 10240

# Remove objects only.
proper:
	rm -f *.o

# Remove objects and executables.
clean:
	rm -f *.o $(TARGET) $(TARGETGPU)
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions for initialization and error checking
#ifndef COMMON_HELPER_CUDA_H_
#define COMMON_HELPER_CUDA_H_
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef MAX
// Fully parenthesized so arguments containing lower-precedence operators
// (e.g. MAX(x & 1, y)) expand correctly. Note: each argument is still
// evaluated twice, so avoid side effects in the arguments.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
#ifndef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#endif
// Wrap every CUDA runtime call; aborts with file/line context on error.
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
// Check for a pending CUDA error (e.g. after a kernel launch).
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
// Abort the program with a diagnostic if a CUDA runtime call returned an
// error. Used via the checkCudaErrors() macro, which supplies the call
// text, file and line.
//
// `inline` keeps the definition header-safe: without it, including this
// header from more than one translation unit causes multiple-definition
// link errors (the sibling __getLastCudaError is already inline).
inline void check(cudaError_t result, char const *const func, const char *const file,
                  int const line)
{
  if (result)
  {
    fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line,
            (int)result, cudaGetErrorName(result), func);
    exit(EXIT_FAILURE);
  }
}
// Check for any pending CUDA error (typically from a preceding kernel
// launch) and abort with `errorMessage` plus file/line context if one
// is found. Used via the getLastCudaError() macro.
inline void __getLastCudaError(const char *errorMessage, const char *file,
                               const int line)
{
  const cudaError_t status = cudaGetLastError();

  if (status != cudaSuccess)
  {
    fprintf(stderr,
            "%s(%i) : getLastCudaError() CUDA error :"
            " %s : (%d) %s.\n",
            file, line, errorMessage, (int)(status),
            cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}
// Beginning of GPU Architecture definitions
// Map an SM (compute capability) version to the number of CUDA cores per
// multiprocessor. Unknown versions fall back to the last known entry,
// with a warning printed on stdout.
//
// `inline` keeps the definition header-safe when this header is included
// from several translation units (matches _ConvertSMVer2ArchName below).
inline int _ConvertSMVer2Cores(int major, int minor)
{
  // Table key is 0xMm (hexadecimal): M = SM major version, m = SM minor.
  typedef struct
  {
    int SM;
    int Cores;
  } sSMtoCores;

  sSMtoCores nGpuArchCoresPerSM[] = {
      {0x30, 192}, // Kepler
      {0x32, 192},
      {0x35, 192},
      {0x37, 192},
      {0x50, 128}, // Maxwell
      {0x52, 128},
      {0x53, 128},
      {0x60, 64},  // Pascal
      {0x61, 128},
      {0x62, 128},
      {0x70, 64},  // Volta
      {0x72, 64},  // Xavier
      {0x75, 64},  // Turing
      {0x80, 64},  // Ampere
      {0x86, 128},
      {0x87, 128},
      {0x89, 128}, // Ada
      {0x90, 128}, // Hopper
      {-1, -1}};   // sentinel

  int index = 0;
  while (nGpuArchCoresPerSM[index].SM != -1)
  {
    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
    {
      return nGpuArchCoresPerSM[index].Cores;
    }
    index++;
  }

  // Unknown SM version: default to the last (most recent) known entry
  // so callers can still run.
  printf(
      "MapSMtoCores for SM %d.%d is undefined."
      " Default to use %d Cores/SM\n",
      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
}
// Map an SM (compute capability) version to its architecture name.
// Unknown versions fall back to the last real table entry, with a
// warning printed on stdout.
inline const char *_ConvertSMVer2ArchName(int major, int minor)
{
  // Table key is 0xMm (hexadecimal): M = SM major version, m = SM minor.
  typedef struct
  {
    int SM;
    const char *name;
  } sSMtoArchName;

  sSMtoArchName archTable[] = {
      {0x30, "Kepler"},
      {0x32, "Kepler"},
      {0x35, "Kepler"},
      {0x37, "Kepler"},
      {0x50, "Maxwell"},
      {0x52, "Maxwell"},
      {0x53, "Maxwell"},
      {0x60, "Pascal"},
      {0x61, "Pascal"},
      {0x62, "Pascal"},
      {0x70, "Volta"},
      {0x72, "Xavier"},
      {0x75, "Turing"},
      {0x80, "Ampere"},
      {0x86, "Ampere"},
      {-1, "Graphics Device"}}; // sentinel

  const int sm = (major << 4) + minor;
  int i;

  for (i = 0; archTable[i].SM != -1; ++i)
  {
    if (archTable[i].SM == sm)
    {
      return archTable[i].name;
    }
  }

  // Unknown SM version: default to the entry just before the sentinel.
  printf(
      "MapSMtoArchName for SM %d.%d is undefined."
      " Default to use %s\n",
      major, minor, archTable[i - 1].name);
  return archTable[i - 1].name;
}
// end of GPU Architecture definitions
// end of CUDA Helper Functions
#endif // COMMON_HELPER_CUDA_H_
\ No newline at end of file
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <inttypes.h>
#include "helper_cuda.h"
#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
#define NSTREAMS 4
/*
 * Wall-clock time in seconds from the raw monotonic clock, as a double.
 * Returns 0 when the clock cannot be read.
 */
double get_elapsedtime(void)
{
  struct timespec ts;

  if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0)
  {
    return 0;
  }
  return (double)ts.tv_sec + 1e-9 * (double)ts.tv_nsec;
}
/*
 * Check out[] against the expected exp(-|sin(i)|) and report the average
 * absolute error: PASSED on stdout, FAILED on stderr.
 *
 * Fixes vs. the previous revision: the duplicated `float err` declaration
 * (a merge/diff artifact that broke compilation) is removed, and fabs()
 * is used instead of abs(), whose C overload truncates to int.
 */
void verif(float *out, int sz)
{
  float err = 0.f;

  for (int i = 0; i < sz; i++)
  {
    err += fabs(out[i] - exp(-fabs(sin(i * 1.0))));
  }

  /* Guard against division by zero on an empty input. */
  float avg = (sz > 0) ? err / sz : 0.f;

  if (avg < 1.e-4)
  {
    fprintf(stdout, "TEST PASSED (error %3.5f < 1.e-4)\n", avg);
  }
  else
  {
    fprintf(stderr, "TEST FAILED (error %3.5f > 1.e-4)\n", avg);
  }
}
/*
 * CPU reference: apply out[i] = exp(-|out[i]|) element-wise, once.
 *
 * Fixes vs. the previous revision: the loop was duplicated by a
 * merge/diff artifact, applying the transform twice; and abs() (the
 * integer overload in C) is replaced by fabs().
 */
void func(float *out, int size)
{
  for (int i = 0; i < size; ++i)
  {
    out[i] = exp(-fabs(out[i]));
  }
}
// GPU kernel: out[i] = exp(-|out[i]|), one thread per element.
// Bounds-checked so grids rounded up past `size` do not write out of range.
__global__ void funck(float *out, int size)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index >= size)
    return;

  // Single-precision math: exp()/abs() on a float promote to double,
  // which is needlessly slow on most GPUs and mismatches float storage.
  out[index] = expf(-fabsf(out[index]));
}
/*
 * Driver: fill a pinned host buffer with sin(i), process it on the GPU in
 * `nstreams` chunks (H2D copy, kernel, D2H copy enqueued per stream so
 * copies and kernels can overlap), time the whole pipeline and each
 * stream, then check the GPU result and a CPU reference with verif().
 *
 * Usage: tp2gpu [size] [nstreams]   (defaults: 1024, NSTREAMS)
 *
 * Fixes vs. the previous revision (merge/diff artifacts): duplicate
 * `int size` block, stray `size *= NSTREAMS`, truncated `cudaMalloc(`
 * fragment and duplicate `return 0` removed; events are now recorded on
 * their own stream (recording on the default stream did not bracket the
 * per-stream work); the last stream handles the remainder when size is
 * not a multiple of nstreams; events are destroyed before exit.
 */
int main(int argc, char** argv)
{
  int size = 1024;
  if (argc >= 2)
  {
    size = atoi(argv[1]);
  }

  int nstreams = NSTREAMS;
  if (argc >= 3)
  {
    nstreams = atoi(argv[2]);
  }

  if (size <= 0 || nstreams <= 0 || nstreams > size)
  {
    fprintf(stderr, "invalid arguments: need 0 < nstreams <= size\n");
    return -1;
  }

  /* Base chunk per stream; the last stream also takes the remainder. */
  int subsize = size / nstreams;

  cudaStream_t streams[nstreams];
  for (int i = 0; i < nstreams; ++i)
  {
    checkCudaErrors(cudaStreamCreate(&streams[i]));
  }

  float *tab_h = NULL;
  float *tab_d = NULL;
  /* Pinned host memory is required for cudaMemcpyAsync to overlap. */
  checkCudaErrors(cudaMallocHost((void **) &tab_h, sizeof(float) * size));
  checkCudaErrors(cudaMalloc((void **) &tab_d, sizeof(float) * size));

  for (int i = 0; i < size; ++i)
  {
    tab_h[i] = sin(i * 1.);
  }

  int blocksize = 1024;
  dim3 dimBlock(blocksize, 1, 1);

  /* One pair of events per stream: ce[2i] before the kernel, ce[2i+1] after. */
  cudaEvent_t ce[nstreams * 2];
  for (int i = 0; i < nstreams * 2; ++i)
  {
    checkCudaErrors(cudaEventCreate(&ce[i]));
  }

  double t0 = get_elapsedtime();

  for (int i = 0; i < nstreams; ++i)
  {
    int offset = subsize * i;
    int chunk = (i == nstreams - 1) ? size - offset : subsize;
    dim3 dimGrid((chunk + blocksize - 1) / blocksize, 1, 1);

    checkCudaErrors(cudaMemcpyAsync(
        tab_d + offset, tab_h + offset,
        sizeof(float) * chunk,
        cudaMemcpyHostToDevice, streams[i]));
    /* Record on streams[i]: an event recorded on the default stream
     * would not time the work enqueued on this stream. */
    checkCudaErrors(cudaEventRecord(ce[i * 2], streams[i]));
    funck<<<dimGrid, dimBlock, 0, streams[i]>>>(tab_d + offset, chunk);
    checkCudaErrors(cudaEventRecord(ce[i * 2 + 1], streams[i]));
    checkCudaErrors(cudaMemcpyAsync(
        tab_h + offset, tab_d + offset,
        sizeof(float) * chunk,
        cudaMemcpyDeviceToHost, streams[i]));
  }

  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("funck kernel failed");

  double t1 = get_elapsedtime();

  fprintf(stdout, "Total Time (%d): %lf ms\n", nstreams, (t1 - t0) * 1000);

  float cudaDuration, cudaTotalDuration = 0;
  for (int i = 0; i < nstreams; ++i)
  {
    checkCudaErrors(cudaEventElapsedTime(&cudaDuration, ce[i * 2], ce[i * 2 + 1]));
    cudaTotalDuration += cudaDuration;
    fprintf(stdout, "Time of stream %d: %lf ms\n", i, cudaDuration);
  }
  fprintf(stdout, "Total Time of streams: %lf ms\n", cudaTotalDuration);

  /* Check the GPU result. */
  verif(tab_h, size);

  /* CPU reference on a plain heap buffer, checked the same way. */
  float *tab = (float *) malloc(sizeof(float) * size);
  if (tab == NULL)
  {
    fprintf(stderr, "Bad allocation\n");
    return -1;
  }
  for (int i = 0; i < size; ++i)
  {
    tab[i] = sin(i * 1.);
  }
  func(tab, size);
  verif(tab, size);

  /* Cleanup. */
  free(tab);
  checkCudaErrors(cudaFree(tab_d));
  checkCudaErrors(cudaFreeHost(tab_h));
  for (int i = 0; i < nstreams * 2; ++i)
  {
    checkCudaErrors(cudaEventDestroy(ce[i]));
  }
  for (int i = 0; i < nstreams; ++i)
  {
    checkCudaErrors(cudaStreamDestroy(streams[i]));
  }
  return 0;
}
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter