Skip to content
Extraits de code Groupes Projets
Valider 628fe5b0 rédigé par Kubat's avatar Kubat Validation de Kubat
Parcourir les fichiers

Add preprocessing file

parent e9744777
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
.ipynb_checkpoints/
__pycache__/
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from matplotlib import style
# Apply matplotlib's built-in 'fivethirtyeight' style sheet to every figure
# created after this module is imported (module-level side effect).
style.use('fivethirtyeight')
# The bare string below is a no-op expression. It holds the notebook magics
# (%matplotlib inline / %pylab inline), which are not valid Python syntax
# outside of IPython/Jupyter.
""" %matplotlib inline
%pylab inline """
def parse_model(X, name_Y, use_columns):
    """
    Split a dataframe into a feature dataframe and a target series.

    Parameters
    ----------
    X: the dataframe containing the features and the target
        Series
    name_Y: the name (column label) of the target feature
    use_columns: the features to use

    Returns
    -------
    X: the dataframe restricted to use_columns
    target: the target column, X[name_Y]

    Raises
    ------
    ValueError: if name_Y is not a column of X
    """
    if name_Y not in X.columns:
        # Format instead of concatenating: column labels are not always
        # strings (e.g. integers), and "str" + int would raise TypeError
        # instead of the intended ValueError.
        raise ValueError(
            "target column {!r} should belong to df".format(name_Y)
        )
    target = X[name_Y]
    features = X[use_columns]
    return features, target
def plot_hist(feature, label1, label2, array_label1, array_label2, bins=20):
    """
    Draw a side-by-side histogram of one feature for two groups and show it.

    Parameters
    ----------
    feature : the name of the column to plot
    label1, label2 : the legend labels of the two groups
    array_label1 : the dataframe holding the rows of the first group
    array_label2 : the dataframe holding the rows of the second group
    bins: number of bins in the histogram
    """
    # Null values are dropped from each group before plotting.
    samples = [
        np.array(frame[feature].dropna())
        for frame in (array_label1, array_label2)
    ]
    plt.hist(samples, bins=bins, color=['r', 'b'], label=[label1, label2])
    plt.legend(loc="upper left")
    plt.title('Relative distribution of %s' % feature)
    plt.show()
def fill_with_median(df, feature, classes):
    """
    Fill null values of a feature with the median of the row's class.

    The dataframe is modified in place and nothing is returned. Rows whose
    class is itself null, or whose class has no non-null value for the
    feature, are left null.

    Parameters
    ----------
    df : the dataframe studied
    feature : the name of the feature to fill
    classes : the name of the column whose value defines the class used
        to compute the median
    """
    # Positional boolean mask, so the function works with any index
    # (non-default or duplicated labels), not only a 0..n-1 RangeIndex.
    missing = df[feature].isna().to_numpy()
    if not missing.any():
        return
    # One median per class, computed once rather than once per missing row.
    class_medians = df.groupby(classes)[feature].median()
    fill_values = df.loc[missing, classes].map(class_medians)
    # Single .loc assignment: chained indexing (df[feature][i] = ...) is not
    # guaranteed to write through to df. to_numpy() skips index alignment.
    df.loc[missing, feature] = fill_values.to_numpy()
def div_cat(cat, num_div):
    """
    Divide a numerical series into quantile-based divisions.

    Each division holds approximately the same number of values. Duplicate
    quantile edges are dropped, so fewer than num_div divisions may be
    produced when the data has many repeated values.

    Parameters
    ----------
    cat : the series (or list) of numerical values to divide
    num_div : the number of divisions requested

    Returns
    -------
    A list, aligned with cat, of 1-based division numbers. A value belongs
    to the first division whose upper edge is greater than or equal to it.
    Null inputs yield NaN so the output keeps the same length as the input.
    An empty input yields an empty list.
    """
    # Work positionally so that any index (or a plain list) is accepted.
    values = pd.Series(cat).reset_index(drop=True)
    if values.empty:
        return []
    edges = pd.qcut(values, num_div, retbins=True, duplicates="drop")[1]
    # side="left" returns the first upper edge that is >= the value.
    positions = np.searchsorted(edges[1:], values.to_numpy(), side="left") + 1
    missing = values.isna().to_numpy()
    return [
        np.nan if is_missing else int(position)
        for position, is_missing in zip(positions, missing)
    ]
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Veuillez vous inscrire ou vous connecter pour commenter