From 96ba9e3c8d7319e4a9b366a663fcc04bf210beb6 Mon Sep 17 00:00:00 2001
From: karnas <xavier.deweerd@ensiie.eu>
Date: Mon, 20 May 2024 15:28:50 +0000
Subject: [PATCH] Add script to download the dataset

---
 .gitignore          |   3 ++
 download_dataset.sh |   3 ++
 rapport.txt         | 122 --------------------------------------------
 3 files changed, 6 insertions(+), 122 deletions(-)
 create mode 100644 .gitignore
 create mode 100755 download_dataset.sh
 delete mode 100644 rapport.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c76ddb3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+dataset.zip
+dataset/*
+
diff --git a/download_dataset.sh b/download_dataset.sh
new file mode 100755
index 0000000..2c1c5d1
--- /dev/null
+++ b/download_dataset.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+# Download and unpack the Kaggle dataset into ./dataset.
+# NOTE(review): this is a time-limited signed URL (X-Goog-Expires=259200s,
+# signed 2024-05-19) — it has expired; regenerate it from Kaggle before use.
+set -e
+wget "https://storage.googleapis.com/kaggle-data-sets/3384322/6207733/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240519%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240519T145233Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=1ac8f9216a239f62f3aa19666ce2b09c188d1d34d5199cf254a3677292e1b893eb10d0e2280baf0cbfb1f21d38a2b99f55e3e080beaa4a376d07326750503e15f35e123e2efd21c2c300a82c5bc06c787528bbe5e0d6b7be5a31bc0e6fb458b9a59456233fb852c658827d1dd547ca683890de508dd88940526568357bdd28611409ed5db0e479abf7b6f98855cd942d0cebfae55d463f288640c594bce7e11cd9f460e941cec80a7713e7faa54e69e3e9c4e9e3cd87b11bc35aa74439f96f80c2d592c6a97519353ca099d62e7276bec190a99e9327aee45ab9531d86f8f6be65fb3931148dbd4342712849494a71adcfe0b4eb54051582393fe8a98ebf68bc" -c -O 'dataset.zip'
+mkdir -p dataset
+unzip -o dataset.zip -d dataset
diff --git a/rapport.txt b/rapport.txt
deleted file mode 100644
index b13e01b..0000000
--- a/rapport.txt
+++ /dev/null
@@ -1,122 +0,0 @@
-rapport : 
-
-
-Partie 1 : Hadoop
-
-cd ensiie/exo/opensearch
-vi recup-bano-opensearch
- 
--> en dessous du commentaire, i in {91..93}
-on vérifie le contenu de bano-data : i y a bien les 3 départements demandés
-
-
-nouveau terminal 
-cd ~/ensiie/exo/hadoop/
-./start-hadoop-ensiie.sh
-•Attendre environ 20 secondes et le relancer une seconde fois : 
-./start-hadoop-ensiie.sh
-
-
-docker run -it -v /home/tessa/ensiie/exo/opensearch/bano-data:/ensiie/tpnote namenode bash  -> ça marche pas pck déjà un volume
-
-
-cp -r ~/ensiie/exo/opensearch/bano-data ~/ensiie/exo/files pck montage déjà fait (sinon docker cp)
-
-docker exec -it namenode bash
-
-cd /data/hdfs/formation/bano-data
-cf copie écran
-
-hdfs dfs -put /data/hdfs/formation/bano-data /ensiie/tpnote/
-cf screen -> pb bizarre avec le /, ça trouvait pas mes fichiers sur hive
-
-
-
-partie 2 : Hive
-
-docker exec -it hive-server bash
-(une fois dans le conteneur)
-/opt/hive/bin/beeline -u jdbc:hive2://hive-server:10000
-
-1)
-hive>
-CREATE DATABASE IF NOT EXISTS ensiie;
-USE ensiie;
-
-2)
-CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.] table_name
-[(col_name data_type [COMMENT col_comment], ...)]
-[COMMENT table_comment]
-[ROW FORMAT row_format]
-[STORED AS file_format]
-
-cf site bano
-
-CREATE EXTERNAL TABLE IF NOT EXISTS bano
-( id STRING,
-  numero STRING, 
-  voie STRING,
-code_postal INT,
-nom_commune STRING,
-source STRING,
-latitude DOUBLE,
-longitude DOUBLE)
-ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
-WITH SERDEPROPERTIES (
-   "separatorChar" = ",",
-   "quoteChar"     = "\""
-)
-STORED AS textfile
-LOCATION '/ensiie/tpnote/bano-data';
-  
-
-SELECT * FROM bano LIMIT 10;
-
-cf screen
-
-
-3)
-CREATE TABLE IF NOT EXISTS banorc
-( id STRING,
-  numero STRING, 
-  voie STRING,
-code_postal INT,
-nom_commune STRING,
-source STRING,
-latitude DOUBLE,
-longitude DOUBLE)
-STORED AS ORC;
-INSERT INTO TABLE banorc
-SELECT * FROM bano;
-
-4) 
-select count (*) from bano ; 
-select count (*) from banorc ;
-
-756087 
-1,364s VS 0,11s
-
-5)
-SELECT SUBSTR(id, 1, 2) AS departement, COUNT(DISTINCT nom_commune) AS nombre_de_communes
-FROM bano
-GROUP BY SUBSTR(id, 1, 2)
-ORDER BY departement;
-
-select count(distinct nom_commune) from bano;
-
-
-Spark : 
-cd /spark/bin
-./spark-shell --driver-memory 2G 
-
-val lines = sc.textFile("hdfs://namenode:9000/ensiie/tpnote/bano-data/bano-91.csv")
-val words = lines.flatMap(_.split("\\s+"))
-val wc = words.map(w => (w, 1)).reduceByKey(_ + _)
-val sorted_words = wc.sortBy(-_._2).take(30)
-
-
-3) OpenSearch Dashboard
-
-cd ensiie/exo/hadoop/
-./stop-hadoop-ensiie.sh
-cd ../opensearch 
-- 
GitLab