diff --git a/rapport/images/genre_in_years.png b/rapport/images/genre_in_years.png
new file mode 100644
index 0000000000000000000000000000000000000000..f0a7383abebd349164e41ff0b28ad17e5fbbe382
Binary files /dev/null and b/rapport/images/genre_in_years.png differ
diff --git a/rapport/images/users_by_country.png b/rapport/images/users_by_country.png
new file mode 100644
index 0000000000000000000000000000000000000000..4822c9c47af31f1e8f25f9326b4f157fa4832290
Binary files /dev/null and b/rapport/images/users_by_country.png differ
diff --git a/treat.py b/treat.py
new file mode 100755
index 0000000000000000000000000000000000000000..9089254638ba504b1bbb62d71d5e5a76a7153f4a
--- /dev/null
+++ b/treat.py
@@ -0,0 +1,132 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Load the three input CSV files (paths are relative to the working directory).
+df = pd.read_csv('files/dataset/anime-dataset-2023.csv')
+# NOTE(review): df_user_filtered is not referenced anywhere else in this script.
+df_user_filtered = pd.read_csv('files/dataset/user-filtered.csv')
+df_detail = pd.read_csv('files/dataset/users-details-2023.csv')
+
+# These columns come in as "object" because missing values are spelled "UNKNOWN".
+# Map "UNKNOWN" to the sentinel -1 and cast: everything to float64 first, then the
+# integer-valued columns on to int64.
+float_col = ["Score"]
+int_col = ["Popularity","Favorites", "Members","Rank","Episodes","Scored By"]
+numeric_cols = float_col + int_col
+df[numeric_cols] = df[numeric_cols].replace("UNKNOWN", "-1").astype("float64")
+df[int_col] = df[int_col].astype("int64")
+
+#for cc in df.columns:
+#    print("%s: dtype=%s"%(cc, df[cc].dtype))
+
+
+# Split "Premiered" ("<season> <year>") into "Premiere_year" and "Premiere_season".
+def yr_season(row):
+    """Add Premiere_year (int) and Premiere_season to row; both are "UNKNOWN" if Premiered is."""
+    premiered = row["Premiered"]
+    if premiered == "UNKNOWN":
+        year = season = "UNKNOWN"
+    else:
+        parts = premiered.split(" ")
+        year, season = int(parts[1]), parts[0]
+    row["Premiere_year"], row["Premiere_season"] = year, season
+    return row
+# Pass every row (axis=1) through yr_season to add the two premiere columns.
+df = df.apply(yr_season, axis=1)
+
+
+def refresh_data_csv(df):
+    """Save df as the refined CSV (no index column) and reload it indexed by anime_id."""
+    refined_path = "anime-dataset-2023-refined.csv"
+    df.to_csv(refined_path, index=False)  # index=False: no stray "Unnamed: 0" column on reload
+    return pd.read_csv(refined_path, index_col="anime_id")
+
+# One-hot encode the genres: start from an all-zero "genre <name>" column per genre.
+# The genre names are hard-coded; the commented-out line they replaced derived them with
+#   list(set(", ".join(df["Genres"].unique()).split(", ")))
+total_genres = ['Adventure', 'Drama', 'Gourmet', 'Romance', 'Sports', 'Action', 'Suspense', 'Fantasy', 'Horror', 'Hentai', 'Supernatural', 'Avant Garde', 'Erotica', 'Mystery', 'Ecchi', 'Sci-Fi', 'Girls Love', 'UNKNOWN', 'Award Winning', 'Slice of Life', 'Boys Love', 'Comedy']
+genre_columns = [f"genre {name}" for name in total_genres]
+df[genre_columns] = 0
+
+def split_genre(row):
+    """Set row["genre <name>"] to 1 for each name in the ", "-separated "Genres" field."""
+    for genre_name in row["Genres"].split(", "):
+        row["genre " + genre_name] = 1
+    return row
+
+# Flag each show's genres, then round-trip through CSV (the reload is indexed by anime_id).
+df = df.apply(split_genre, axis=1)
+df = refresh_data_csv(df)
+
+# Genre columns used for the per-year analysis (Hentai, Erotica, UNKNOWN, Award Winning are left out).
+genres = ['genre Adventure', 'genre Drama', 'genre Gourmet', 'genre Romance', 'genre Sports','genre Action', 'genre Suspense', 'genre Fantasy', 'genre Horror', 'genre Supernatural', 'genre Avant Garde', 'genre Mystery', 'genre Ecchi', 'genre Sci-Fi','genre Girls Love', 'genre Slice of Life', 'genre Boys Love', 'genre Comedy']
+
+# Per premiere year: mean score, number of shows, and number of shows flagged with each genre.
+shows_by_year = df.groupby("Premiere_year")
+by_year = shows_by_year.agg(mean_score=("Score","mean"), show_count=("Name","count"))
+by_year_genre = shows_by_year.agg(dict.fromkeys(genres, "sum"))
+# Row total over the genre columns (a show with several genres counts once per genre).
+by_year_genre["genre_all_sum"] = by_year_genre.sum(axis=1)
+
+# Colormap used by the heatmaps.
+cmap = "YlOrBr"
+
+by_year = pd.concat([by_year, by_year_genre], axis=1)
+genre_labels = [gg.split(" ", 1)[1] for gg in genres]  # drop the "genre " prefix
+
+# Figure 1: absolute counts. NOTE(review): this figure is built but never saved or shown.
+fig1 = plt.figure()
+ax1 = sns.heatmap(by_year[genres].T, cmap=cmap)
+ax1.set_title("Number of shows per genre per year")
+ax1.set_yticklabels(genre_labels)
+ax1.set_xlabel("Premiere year")
+# Figure 2: the same counts divided by the number of shows premiered that year.
+by_year[genres] = by_year[genres].div(by_year["show_count"], axis=0)
+fig2 = plt.figure()
+ax2 = sns.heatmap(by_year[genres].T, cmap=cmap)
+ax2.set_yticklabels(genre_labels)
+ax2.set_title("Proportion of each genre among all shows in each year")
+ax2.set_xlabel("Premiere year")
+plt.savefig("genre_in_years")
+
+# NOTE(review): x is only referenced by the commented-out checks below.
+x = df[['Name','Synopsis']]
+#print(x.isnull().sum())
+#print(x.head)
+
+
+# Clean the "Location" values down to a country name.
+def extract_country(location):
+    """Return the text after the last comma of location, or "Unknown".
+
+    "Unknown" is returned for missing values, the literal "NULL" and blank results;
+    surrounding whitespace is ignored, so "Paris,France" and "Paris, France " both give "France".
+    """
+    if pd.isnull(location) or location == "NULL":
+        return "Unknown"
+    # rpartition yields the whole string as the last piece when there is no comma.
+    country = location.rpartition(",")[2].strip()
+    return country or "Unknown"
+
+# Derive one country per user from the "Location" column.
+df_detail['Country'] = df_detail['Location'].apply(extract_country)
+
+# Count users per country, most frequent first.
+by_country = df_detail['Country'].value_counts().reset_index()
+by_country.columns = ['Country', 'User_count']
+by_country = by_country.sort_values(by='User_count', ascending=False)
+
+# Top 15 named countries: drop the "Unknown" bucket by name rather than slicing off the
+# first row (head(15)[1:] kept only 14 rows and assumed "Unknown" ranked first).
+known_countries = by_country[by_country['Country'] != "Unknown"]
+top_15_countries = known_countries.head(15)
+print(top_15_countries)
+
+# Bar chart of user counts per country, saved as "users_by_country".
+fig, ax = plt.subplots(figsize=(12, 8))
+sns.barplot(x='User_count', y='Country', data=top_15_countries, palette="viridis", ax=ax)
+ax.set_title('Top 15 des pays regardant le plus d\'anime')
+ax.set_xlabel('Nombre d\'utilisateurs')
+ax.set_ylabel('Pays')
+plt.savefig("users_by_country")