From c92343a3b3771196620e97e4b1c1ed7b0d58dad9 Mon Sep 17 00:00:00 2001 From: karnas <xavier.deweerd@ensiie.eu> Date: Wed, 22 May 2024 05:07:35 +0200 Subject: [PATCH] fixed sql queries further and added script to fix csv --- files/scripts/init_db.sql | 85 +++++++++++++++++++ files/{ => scripts}/others_tables.sql | 0 files/scripts/replace_newlines.py | 33 +++++++ .../wrong_init_db.sql} | 33 ++++--- 4 files changed, 133 insertions(+), 18 deletions(-) create mode 100644 files/scripts/init_db.sql rename files/{ => scripts}/others_tables.sql (100%) create mode 100755 files/scripts/replace_newlines.py rename files/{init_db.sql => scripts/wrong_init_db.sql} (65%) diff --git a/files/scripts/init_db.sql b/files/scripts/init_db.sql new file mode 100644 index 0000000..2b68dc1 --- /dev/null +++ b/files/scripts/init_db.sql @@ -0,0 +1,85 @@ +CREATE DATABASE IF NOT EXISTS myanimelist_db; + +USE myanimelist_db; + + +CREATE EXTERNAL TABLE IF NOT EXISTS score ( + user_id INT, + anime_id INT, + rating INT +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +WITH SERDEPROPERTIES ( + "separatorChar" = ',', + "quoteChar" = '\"', + "escapeChar" = '\\' +) +TBLPROPERTIES ("skip.header.line.count"="1") +LOCATION '/dataset/score'; + + +CREATE EXTERNAL TABLE IF NOT EXISTS anime ( + anime_id INT, + name VARCHAR(255), + english_name VARCHAR(255), + other_name VARCHAR(255), + score DECIMAL(3,2), + genres VARCHAR(255), + synopsis STRING, + type VARCHAR(50), + episodes INT, + aired VARCHAR(255), + premiered VARCHAR(50), + status VARCHAR(50), + producers STRING, + licensors STRING, + studios STRING, + source VARCHAR(100), + duration VARCHAR(50), + rating VARCHAR(50), + rank INT, + popularity INT, + favorites INT, + scored_by INT, + members INT, + image_url STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +WITH SERDEPROPERTIES ( + "separatorChar" = ',', + "quoteChar" = '\"', + "escapeChar" = '\\' +) +TBLPROPERTIES ("skip.header.line.count"="1") +LOCATION 
#!/usr/bin/env python3
"""Escape line breaks embedded in CSV fields.

For each CSV path given on the command line, a sibling file named
``<name>_modified<ext>`` is written in which every line break found inside
a field value is replaced by the two characters backslash + ``n``, so that
each record occupies exactly one physical line.
"""

import os
import sys

import pandas as pd

# CRLF, a lone LF, or a lone CR inside a field value. CRLF is listed first so
# that it is replaced by a single escaped "\n" rather than two.
_LINE_BREAK = r"\r\n|\r|\n"


def replace_newlines(input_file):
    """Write a copy of *input_file* with in-field line breaks escaped.

    Args:
        input_file: Path of the CSV file to read.

    Returns:
        The path of the file that was written (``<base>_modified<ext>`` next
        to the input), or ``None`` when *input_file* is not an existing file
        (a message is printed in that case and nothing is written).
    """
    # Check that the file exists
    if not os.path.isfile(input_file):
        print(f"Le fichier {input_file} n'existe pas.")
        return None

    # Load the CSV with every column as text and with NA detection disabled.
    # Letting pandas infer types would rewrite data unrelated to newlines:
    # integer columns containing an empty cell come back as floats
    # ("12" -> "12.0") and strings such as "NA" or "null" are turned into
    # missing values and written out as empty fields.
    df = pd.read_csv(input_file, dtype=str, na_filter=False)

    # Replace line breaks with a literal backslash + "n" in each column.
    # In the replacement template, r"\\n" denotes an escaped backslash
    # followed by the letter "n".
    for column in df.columns:
        df[column] = df[column].str.replace(_LINE_BREAK, r"\\n", regex=True)

    # Build the output file name
    base, ext = os.path.splitext(input_file)
    output_file = f"{base}_modified{ext}"

    # Save the modified file
    df.to_csv(output_file, index=False)
    print(f"Fichier modifié sauvegardé sous le nom : {output_file}")
    return output_file


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python replace_newlines.py <file1.csv> <file2.csv> ...")
    else:
        for input_file in sys.argv[1:]:
            replace_newlines(input_file)
from files/init_db.sql rename to files/scripts/wrong_init_db.sql index d71ad30..1fdc1ea 100644 --- a/files/init_db.sql +++ b/files/scripts/wrong_init_db.sql @@ -8,12 +8,11 @@ CREATE EXTERNAL TABLE IF NOT EXISTS score ( anime_id INT, rating INT ) -ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' -WITH SERDEPROPERTIES ( - "separatorChar" = ',', - "quoteChar" = '\"' -) -LOCATION '/dataset/score'; +ROW FORMAT DELIMITED +FIELDS TERMINATED BY ',' +LINES TERMINATED BY '\n' +STORED AS TEXTFILE +LOCATION '/dataset/'; CREATE EXTERNAL TABLE IF NOT EXISTS anime ( @@ -42,12 +41,11 @@ CREATE EXTERNAL TABLE IF NOT EXISTS anime ( members INT, image_url STRING ) -ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' -WITH SERDEPROPERTIES ( - "separatorChar" = ',', - "quoteChar" = '\"' -) -LOCATION '/dataset/anime'; +ROW FORMAT DELIMITED +FIELDS TERMINATED BY ',' +LINES TERMINATED BY '\n' +STORED AS TEXTFILE +LOCATION '/dataset/'; CREATE EXTERNAL TABLE IF NOT EXISTS users ( mal_id INT, @@ -67,12 +65,11 @@ CREATE EXTERNAL TABLE IF NOT EXISTS users ( rewatched FLOAT, episodes_watched FLOAT ) -ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' -WITH SERDEPROPERTIES ( - "separatorChar" = ',', - "quoteChar" = '\"' -) -LOCATION '/dataset/users'; +ROW FORMAT DELIMITED +FIELDS TERMINATED BY ',' +LINES TERMINATED BY '\n' +STORED AS TEXTFILE +LOCATION '/dataset/'; -- GitLab