"""Exploratory analysis of the 2023 anime dataset.

Running this script:

1. loads the anime, user-rating and user-detail CSV files from
   ``files/dataset/``;
2. cleans the anime table (numeric columns, premiere year/season, one
   indicator column per genre) and writes it to
   ``anime-dataset-2023-refined.csv``;
3. draws two genre-by-premiere-year heatmaps and saves the second one
   (genre proportions) as ``genre_in_years``;
4. counts users per country and saves a bar chart of the 15 largest
   countries as ``users_by_country``.

Both figures are written to the current working directory in matplotlib's
default image format.

NOTE(review): this script was added in the same change as
``rapport/images/genre_in_years.png`` and
``rapport/images/users_by_country.png``; those images are presumably copies
of the two figures saved here -- confirm.
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Placeholder the anime dataset uses for missing values.
UNKNOWN = "UNKNOWN"
# Label given to users whose location is missing or unusable.
UNKNOWN_COUNTRY = "Unknown"
# Prefix of the per-genre indicator columns added to the anime table.
GENRE_PREFIX = "genre "
# Where the cleaned anime table is written (and re-read from).
REFINED_CSV = "anime-dataset-2023-refined.csv"


def yr_season(row):
    """Return a copy of *row* with ``Premiere_year`` and ``Premiere_season``.

    ``row["Premiered"]`` is expected to look like ``"<season> <year>"``.
    The year is stored as an ``int`` and the season as a string.  When the
    value is the ``"UNKNOWN"`` placeholder, both new fields are set to
    ``"UNKNOWN"`` as well.

    Raises:
        IndexError, ValueError: if a non-placeholder value does not have a
            numeric second word.
    """
    # DataFrame.apply does not support functions that mutate the object they
    # are given, so work on a copy.
    row = row.copy()
    premiered = row["Premiered"]
    if premiered == UNKNOWN:
        row["Premiere_year"] = UNKNOWN
        row["Premiere_season"] = UNKNOWN
        return row
    info = premiered.split(" ")
    row["Premiere_year"] = int(info[1])
    row["Premiere_season"] = info[0]
    return row


def refresh_data_csv(df, path=REFINED_CSV):
    """Write *df* to *path* and return it re-read, indexed by ``anime_id``.

    The round trip through CSV makes pandas infer the column dtypes again.

    Args:
        df: table to persist; ``anime_id`` must be either a column or the
            (named) index.
        path: destination CSV file.

    Returns:
        The table as read back from *path*, with ``anime_id`` as index.
    """
    # Only write the index when it carries data (i.e. it is named).  Writing
    # the default RangeIndex would come back as a junk "Unnamed: 0" column.
    df.to_csv(path, index=df.index.name is not None)
    # low_memory=False: infer each column's type from the whole file rather
    # than chunk by chunk, so a column mixing numbers and "UNKNOWN" cannot
    # end up with mixed int/str values.
    return pd.read_csv(path, index_col="anime_id", low_memory=False)


def split_genre(row):
    """Return a copy of *row* with the indicator of each of its genres set to 1.

    ``row["Genres"]`` is a ``", "``-separated list of genre names; for every
    name ``g`` in it the field ``"genre " + g`` is set to ``1``.  Indicators
    of other genres are left untouched.
    """
    # See yr_season: never mutate the row handed over by DataFrame.apply.
    row = row.copy()
    for genre in row["Genres"].split(", "):
        row[GENRE_PREFIX + genre] = 1
    return row


def extract_country(location):
    """Return the country part of a free-text user location.

    The country is taken to be the last non-empty comma-separated component,
    so ``"City, Country"`` and ``"Country"`` both yield ``"Country"``.
    Missing values, empty strings and the literal ``"NULL"`` yield
    ``"Unknown"``.
    """
    if pd.isnull(location):
        return UNKNOWN_COUNTRY
    text = str(location).strip()
    if not text or text == "NULL":
        return UNKNOWN_COUNTRY
    # Split on the bare comma and strip each part so that "City,Country" and
    # "City ,  Country" are handled like "City, Country".
    parts = [part.strip() for part in text.split(",")]
    non_empty = [part for part in parts if part]
    return non_empty[-1] if non_empty else UNKNOWN_COUNTRY


def _plot_genre_heatmap(table, columns, labels, title, cmap):
    """Draw a genre (rows) x premiere year (columns) heatmap on a new figure.

    Args:
        table: frame indexed by premiere year holding one column per genre.
        columns: genre columns of *table* to plot, in display order.
        labels: y tick labels, one per entry of *columns*.
        title: figure title.
        cmap: matplotlib colormap name.

    Returns:
        The newly created figure.
    """
    fig = plt.figure()
    # Passing the labels to seaborn keeps them aligned with the rows even if
    # seaborn would otherwise thin out the ticks.
    ax = sns.heatmap(table[columns].transpose(), cmap=cmap, yticklabels=labels)
    ax.set_title(title)
    ax.set_xlabel("Premiere year")
    return fig


# --------------------------------------------------------------------------
# Load the raw data
# --------------------------------------------------------------------------
df = pd.read_csv("files/dataset/anime-dataset-2023.csv")
# NOTE(review): df_user_filtered is not referenced anywhere else in this
# script -- confirm whether this load is still needed.
df_user_filtered = pd.read_csv("files/dataset/user-filtered.csv")
df_detail = pd.read_csv("files/dataset/users-details-2023.csv")

# --------------------------------------------------------------------------
# Clean the anime table
# --------------------------------------------------------------------------
# These columns are loaded as "object" because missing values are spelled
# "UNKNOWN"; replace the placeholder with -1 and convert them to numbers.
float_col = ["Score"]
int_col = ["Popularity", "Favorites", "Members", "Rank", "Episodes", "Scored By"]
numeric_col = float_col + int_col
# Everything goes through float64 first; the integer columns are narrowed to
# int64 afterwards (presumably because some values are written like "12.0",
# which cannot be cast to an integer directly -- confirm).
df[numeric_col] = df[numeric_col].replace(UNKNOWN, "-1").astype("float64")
df[int_col] = df[int_col].astype("int64")

# Separate "Premiered" into "Premiere_year" and "Premiere_season".
df = df.apply(yr_season, axis=1)

# One indicator column per genre.  The list was obtained once from the data
# by splitting every distinct "Genres" value on ", ".
total_genres = [
    "Adventure", "Drama", "Gourmet", "Romance", "Sports", "Action",
    "Suspense", "Fantasy", "Horror", "Hentai", "Supernatural", "Avant Garde",
    "Erotica", "Mystery", "Ecchi", "Sci-Fi", "Girls Love", "UNKNOWN",
    "Award Winning", "Slice of Life", "Boys Love", "Comedy",
]
df[[GENRE_PREFIX + gg for gg in total_genres]] = 0
df = df.apply(split_genre, axis=1)

# Persist the refined table and reload it indexed by anime_id.
df = refresh_data_csv(df)

# --------------------------------------------------------------------------
# Genres over the years
# --------------------------------------------------------------------------
# Genres shown in the heatmaps: every entry of total_genres except Hentai,
# Erotica, UNKNOWN and Award Winning.
genre_labels = [
    "Adventure", "Drama", "Gourmet", "Romance", "Sports", "Action",
    "Suspense", "Fantasy", "Horror", "Supernatural", "Avant Garde", "Mystery",
    "Ecchi", "Sci-Fi", "Girls Love", "Slice of Life", "Boys Love", "Comedy",
]
genres = [GENRE_PREFIX + name for name in genre_labels]

# NOTE(review): "Score" still contains the -1 placeholder at this point, so
# mean_score is pulled down by shows without a score -- confirm whether those
# rows should be excluded before this value is used.
by_year = df.groupby("Premiere_year").agg(
    mean_score=("Score", "mean"),
    show_count=("Name", "count"),
)
by_year_genre = df.groupby("Premiere_year")[genres].sum()
by_year_genre["genre_all_sum"] = by_year_genre.sum(axis=1)

cmap = "YlOrBr"

by_year = pd.concat([by_year, by_year_genre], axis=1)

# Absolute number of shows.  This figure is created but never saved.
fig1 = _plot_genre_heatmap(
    by_year, genres, genre_labels,
    "Number of shows per genre per year", cmap,
)

# Turn the counts into the share of each year's shows that carry the genre.
by_year[genres] = by_year[genres].div(by_year["show_count"], axis=0)

fig2 = _plot_genre_heatmap(
    by_year, genres, genre_labels,
    "Proportion of each genre among all shows in each year", cmap,
)
fig2.savefig("genre_in_years")

# NOTE(review): x is not used below -- confirm whether it is still needed.
x = df[["Name", "Synopsis"]]

# --------------------------------------------------------------------------
# Users per country
# --------------------------------------------------------------------------
# Clean the free-text location to keep only the country.
df_detail["Country"] = df_detail["Location"].apply(extract_country)

# Count the users of each country; value_counts() already orders the result
# from the largest to the smallest count.
by_country = (
    df_detail["Country"]
    .value_counts()
    .rename_axis("Country")
    .reset_index(name="User_count")
)

# Leave out the "Unknown" bucket explicitly (instead of assuming it is the
# first row) so the chart really shows 15 countries.
top_15_countries = by_country[by_country["Country"] != UNKNOWN_COUNTRY].head(15)

# Show the result.
print(top_15_countries)

# Bar chart of the 15 countries with the most users.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x="User_count",
    y="Country",
    data=top_15_countries,
    palette="viridis",
    ax=ax,
)
ax.set_title("Top 15 des pays regardant le plus d'anime")
ax.set_xlabel("Nombre d'utilisateurs")
ax.set_ylabel("Pays")
fig.savefig("users_by_country")