diff --git a/download_dataset.sh b/download_dataset.sh index ac74abdd9d945f5ec15fcad684534c68ed52c180..b5aa52d56196ee767219b7c395c9da8c8efe87e8 100755 --- a/download_dataset.sh +++ b/download_dataset.sh @@ -1,3 +1,3 @@ mkdir -p files/dataset -wget "https://storage.googleapis.com/kaggle-data-sets/3384322/6207733/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240519%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240519T145233Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=1ac8f9216a239f62f3aa19666ce2b09c188d1d34d5199cf254a3677292e1b893eb10d0e2280baf0cbfb1f21d38a2b99f55e3e080beaa4a376d07326750503e15f35e123e2efd21c2c300a82c5bc06c787528bbe5e0d6b7be5a31bc0e6fb458b9a59456233fb852c658827d1dd547ca683890de508dd88940526568357bdd28611409ed5db0e479abf7b6f98855cd942d0cebfae55d463f288640c594bce7e11cd9f460e941cec80a7713e7faa54e69e3e9c4e9e3cd87b11bc35aa74439f96f80c2d592c6a97519353ca099d62e7276bec190a99e9327aee45ab9531d86f8f6be65fb3931148dbd4342712849494a71adcfe0b4eb54051582393fe8a98ebf68bc" -c -O 'files/dataset.zip' +wget --header="Host: storage.googleapis.com" --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" --header="Accept-Language: fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6,zh-CN;q=0.5,zh;q=0.4" --header="Referer: https://www.kaggle.com/" 
"https://storage.googleapis.com/kaggle-data-sets/3384322/6207733/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240519%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240519T145233Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=1ac8f9216a239f62f3aa19666ce2b09c188d1d34d5199cf254a3677292e1b893eb10d0e2280baf0cbfb1f21d38a2b99f55e3e080beaa4a376d07326750503e15f35e123e2efd21c2c300a82c5bc06c787528bbe5e0d6b7be5a31bc0e6fb458b9a59456233fb852c658827d1dd547ca683890de508dd88940526568357bdd28611409ed5db0e479abf7b6f98855cd942d0cebfae55d463f288640c594bce7e11cd9f460e941cec80a7713e7faa54e69e3e9c4e9e3cd87b11bc35aa74439f96f80c2d592c6a97519353ca099d62e7276bec190a99e9327aee45ab9531d86f8f6be65fb3931148dbd4342712849494a71adcfe0b4eb54051582393fe8a98ebf68bc" -c -O 'dataset.zip' unzip dataset.zip -d files/dataset diff --git a/DB_commands.sql b/files/DB_commands.sql similarity index 100% rename from DB_commands.sql rename to files/DB_commands.sql diff --git a/rapport.md b/rapport.md deleted file mode 100644 index 28d98f23aaa6100494f3b0c333dbfc9a86ae7a6e..0000000000000000000000000000000000000000 --- a/rapport.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -title: "Rapport du Projet d'IPBD" -author: - - Alexandre FORT - - Tessa DEPAOLI - - Xavier DE WEERD -lang: fr -geometry: "top=2cm, bottom=2cm, left=2cm, right=2cm" -fontsize: 12pt -linkcolor: "blue" -toc: true -toc-depth: 3 -titlepage: true -papersize: "a4paper" ---- -# Analyse des tendances sur les animés - -## Objectif et mise en place - -### Description des données et objectif - -Le dataset que nous avons choisi est le suivant : https://www.kaggle.com/datasets/dbdmobile/myanimelist-dataset/. - -Il s'agit de données sur les animés, les profils d'utilisateurs et les notations des utilisateurs de la plateforme [MyAnimeList](https://myanimelist.net/). 
- -Notre but est d'analyser ces données afin de trouver des tendances ou/et corrélation dans les données. - -Pour cela, on utilisera les outils présentés pendant les différents ateliers en cherchant à en exposer de nouveaux usages : -- Docker -- Hive -- Spark -- OpenSearch -- OpenSearch Dashboards - -### Mise en place - -On récupère le git du projet, contenant les scripts pour lancer le cluster Hive, Hadoop et OpenSearch : -```sh -cd ~ -git clone -``` - -On récupère les données en faisant une requête wget : -```shell -wget "https://storage.googleapis.com/kaggle-data-sets/3384322/6207733/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240519%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240519T145233Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=1ac8f9216a239f62f3aa19666ce2b09c188d1d34d5199cf254a3677292e1b893eb10d0e2280baf0cbfb1f21d38a2b99f55e3e080beaa4a376d07326750503e15f35e123e2efd21c2c300a82c5bc06c787528bbe5e0d6b7be5a31bc0e6fb458b9a59456233fb852c658827d1dd547ca683890de508dd88940526568357bdd28611409ed5db0e479abf7b6f98855cd942d0cebfae55d463f288640c594bce7e11cd9f460e941cec80a7713e7faa54e69e3e9c4e9e3cd87b11bc35aa74439f96f80c2d592c6a97519353ca099d62e7276bec190a99e9327aee45ab9531d86f8f6be65fb3931148dbd4342712849494a71adcfe0b4eb54051582393fe8a98ebf68bc" -c -O 'archive.zip' -mkdir dataset -unzip archive.zip -d dataset -``` - -## 1. 
Chargement des données sur HDFS - -On créer les containeurs avec docker-compose et le fichier de configuration `~/` -```yaml -services: - namenode: - image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8 - networks: - - hadoop - container_name: namenode - volumes: - - hadoop_namenode:/hadoop/dfs/name - - ~/ensiie/exo/files:/data/hdfs/formation - - ~/ensiie/tpnote:/data/hdfs/tpnote - environment: - - CLUSTER_NAME=test - env_file: - - ./hadoop.env - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == akswnc4.aksw.uni-leipzig.de - ports: - - 9870:9870 - - 9000:9000 -``` - diff --git a/google.txt b/rapport/google.txt similarity index 100% rename from google.txt rename to rapport/google.txt diff --git a/rapport-karnas.md b/rapport/rapport.md similarity index 100% rename from rapport-karnas.md rename to rapport/rapport.md diff --git a/sake.txt b/rapport/sake.txt similarity index 97% rename from sake.txt rename to rapport/sake.txt index f39b5af02595936e69cea3abb9a09c22450ca36e..c655bc7bb6c5c57c3b9684520cdfb8e8fe3913e0 100644 --- a/sake.txt +++ b/rapport/sake.txt @@ -181,7 +181,7 @@ nc -zv hive-server 10000 Nouvelle commande : /opt/hive/bin/beeline -u jdbc:hive2://hive-server:10000 -Il faut donc maintenant construire la BD -> cf DB_commands +Il faut donc maintenant construire la BD -> cf DB_commands.sql