Skip to content
Extraits de code Groupes Projets
Valider c92343a3 rédigé par Karnas's avatar Karnas
Parcourir les fichiers

fixed sql queries further and added script to fix csv

parent eb2d1a8a
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
-- Create the working database if it does not exist yet, then make it the
-- current database for every statement that follows in this script.
CREATE DATABASE IF NOT EXISTS myanimelist_db;
USE myanimelist_db;
-- External table over the CSV files stored under /dataset/score
-- (columns: user_id, anime_id, rating). The table is EXTERNAL, so dropping it
-- leaves the underlying files in place.
-- The first line of each file is skipped as a header ("skip.header.line.count").
-- Clause order matters: Hive's CREATE TABLE grammar expects LOCATION before
-- TBLPROPERTIES, otherwise the statement fails to parse.
-- NOTE(review): OpenCSVSerde is documented to expose every column as STRING
-- regardless of the declared type; CAST in queries if numeric semantics are
-- needed -- confirm against the Hive version in use.
CREATE EXTERNAL TABLE IF NOT EXISTS score (
    user_id INT,
    anime_id INT,
    rating INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    "separatorChar" = ',',
    "quoteChar" = '\"',
    "escapeChar" = '\\'
)
LOCATION '/dataset/score'
TBLPROPERTIES ("skip.header.line.count"="1");
-- External table over the CSV files stored under /dataset/anime, one column
-- per CSV field in file order. The table is EXTERNAL, so dropping it leaves
-- the underlying files in place.
-- The first line of each file is skipped as a header ("skip.header.line.count").
-- Clause order matters: Hive's CREATE TABLE grammar expects LOCATION before
-- TBLPROPERTIES, otherwise the statement fails to parse.
-- NOTE(review): OpenCSVSerde is documented to expose every column as STRING
-- regardless of the declared type; CAST in queries if numeric semantics are
-- needed -- confirm against the Hive version in use.
CREATE EXTERNAL TABLE IF NOT EXISTS anime (
    anime_id INT,
    name VARCHAR(255),
    english_name VARCHAR(255),
    other_name VARCHAR(255),
    score DECIMAL(3,2),
    genres VARCHAR(255),
    synopsis STRING,
    type VARCHAR(50),
    episodes INT,
    aired VARCHAR(255),
    premiered VARCHAR(50),
    status VARCHAR(50),
    producers STRING,
    licensors STRING,
    studios STRING,
    source VARCHAR(100),
    duration VARCHAR(50),
    rating VARCHAR(50),
    rank INT,
    popularity INT,
    favorites INT,
    scored_by INT,
    members INT,
    image_url STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    "separatorChar" = ',',
    "quoteChar" = '\"',
    "escapeChar" = '\\'
)
LOCATION '/dataset/anime'
TBLPROPERTIES ("skip.header.line.count"="1");
-- External table over the CSV files stored under /dataset/users, one column
-- per CSV field in file order. The table is EXTERNAL, so dropping it leaves
-- the underlying files in place.
-- The first line of each file is skipped as a header ("skip.header.line.count").
-- Clause order matters: Hive's CREATE TABLE grammar expects LOCATION before
-- TBLPROPERTIES, otherwise the statement fails to parse.
-- NOTE(review): OpenCSVSerde is documented to expose every column as STRING
-- regardless of the declared type (including the TIMESTAMP and FLOAT columns
-- below); CAST in queries if typed semantics are needed -- confirm against the
-- Hive version in use.
CREATE EXTERNAL TABLE IF NOT EXISTS users (
    mal_id INT,
    username STRING,
    gender STRING,
    birthday TIMESTAMP,
    location STRING,
    joined TIMESTAMP,
    days_watched FLOAT,
    mean_score FLOAT,
    watching FLOAT,
    completed FLOAT,
    on_hold FLOAT,
    dropped FLOAT,
    plan_to_watch FLOAT,
    total_entries FLOAT,
    rewatched FLOAT,
    episodes_watched FLOAT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    "separatorChar" = ',',
    "quoteChar" = '\"',
    "escapeChar" = '\\'
)
LOCATION '/dataset/users'
TBLPROPERTIES ("skip.header.line.count"="1");
Fichier déplacé
#!/usr/bin/env python3
import pandas as pd
import sys
import os
def replace_newlines(input_file):
    """Write a copy of a CSV file with embedded line breaks made literal.

    Every LF or CRLF found inside a cell is replaced by the two-character
    sequence backslash + ``n`` so that each record fits on one physical line.
    The result is saved next to the input as ``<name>_modified<ext>``; the
    input file itself is not changed.

    Args:
        input_file: Path of the CSV file to process.

    Returns:
        None. If ``input_file`` is not an existing file, a message is printed
        and nothing is written.
    """
    # Bail out early when the path does not point to a regular file.
    if not os.path.isfile(input_file):
        print(f"Le fichier {input_file} n'existe pas.")
        return

    # Read every cell as text and disable NA detection so that the only change
    # made to the data is the newline replacement: without this, pandas would
    # re-type columns (e.g. integers with gaps come back as "1.0") and turn
    # literal strings such as "NA" or "None" into empty cells.
    df = pd.read_csv(input_file, dtype=str, keep_default_na=False)

    # In the regex replacement template, "\\" yields one literal backslash, so
    # each line break becomes the two characters backslash + "n".
    df.replace(to_replace=r'\r?\n', value=r'\\n', regex=True, inplace=True)

    # Derive the output path: same directory and extension, "_modified" suffix.
    base, ext = os.path.splitext(input_file)
    output_file = f"{base}_modified{ext}"

    # Save without the DataFrame index so the column layout matches the input.
    df.to_csv(output_file, index=False)
    print(f"Fichier modifié sauvegardé sous le nom : {output_file}")
if __name__ == "__main__":
    # Command-line entry point: every argument after the script name is
    # treated as a CSV path and processed in the order given.
    csv_paths = sys.argv[1:]
    if not csv_paths:
        # No file supplied: show how the script is meant to be invoked.
        print("Usage: python replace_newlines.py <file1.csv> <file2.csv> ...")
    for csv_path in csv_paths:
        replace_newlines(csv_path)
...@@ -8,12 +8,11 @@ CREATE EXTERNAL TABLE IF NOT EXISTS score ( ...@@ -8,12 +8,11 @@ CREATE EXTERNAL TABLE IF NOT EXISTS score (
anime_id INT, anime_id INT,
rating INT rating INT
) )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' ROW FORMAT DELIMITED
WITH SERDEPROPERTIES ( FIELDS TERMINATED BY ','
"separatorChar" = ',', LINES TERMINATED BY '\n'
"quoteChar" = '\"' STORED AS TEXTFILE
) LOCATION '/dataset/';
LOCATION '/dataset/score';
CREATE EXTERNAL TABLE IF NOT EXISTS anime ( CREATE EXTERNAL TABLE IF NOT EXISTS anime (
...@@ -42,12 +41,11 @@ CREATE EXTERNAL TABLE IF NOT EXISTS anime ( ...@@ -42,12 +41,11 @@ CREATE EXTERNAL TABLE IF NOT EXISTS anime (
members INT, members INT,
image_url STRING image_url STRING
) )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' ROW FORMAT DELIMITED
WITH SERDEPROPERTIES ( FIELDS TERMINATED BY ','
"separatorChar" = ',', LINES TERMINATED BY '\n'
"quoteChar" = '\"' STORED AS TEXTFILE
) LOCATION '/dataset/';
LOCATION '/dataset/anime';
CREATE EXTERNAL TABLE IF NOT EXISTS users ( CREATE EXTERNAL TABLE IF NOT EXISTS users (
mal_id INT, mal_id INT,
...@@ -67,12 +65,11 @@ CREATE EXTERNAL TABLE IF NOT EXISTS users ( ...@@ -67,12 +65,11 @@ CREATE EXTERNAL TABLE IF NOT EXISTS users (
rewatched FLOAT, rewatched FLOAT,
episodes_watched FLOAT episodes_watched FLOAT
) )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' ROW FORMAT DELIMITED
WITH SERDEPROPERTIES ( FIELDS TERMINATED BY ','
"separatorChar" = ',', LINES TERMINATED BY '\n'
"quoteChar" = '\"' STORED AS TEXTFILE
) LOCATION '/dataset/';
LOCATION '/dataset/users';
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter