Add preprocessing file

628fe5b0 · Kubat · Kubat · e9744777 · 628fe5b0 · 628fe5b0
--- a/.gitignore
+++ b/.gitignore
+.ipynb_checkpoints/
+__pycache__/
--- a/_titanic/preprocessing.py
+++ b/_titanic/preprocessing.py
+import pandas as pd
+import numpy as np
+from sklearn import preprocessing
+import matplotlib.pyplot as plt
+from matplotlib import style
+style.use('fivethirtyeight')
+""" %matplotlib inline
+%pylab inline """
+
+
+def parse_model(X, name_Y, use_columns):
+    """
+    Parse the model in two dataframes : the feature dataframe
+    and the target dataframe.
+
+    Parameters
+    ----------
+    X: the dataframe containing the features and the target
+       Series
+
+    name_Y: the name of the target feature
+
+    use_columns: the features to use
+
+    Returns
+    -------
+    X: the dataframe with the use_columns
+
+    target: the target dataframe
+    """
+    if name_Y not in X.columns:
+        raise ValueError("target column"+ name_Y+" should belong to df")
+    target = X[name_Y]
+    X = X[use_columns]
+    return X, target
+
+def plot_hist(feature, label1, label2, array_label1, array_label2, bins=20):
+    """
+    Plots a histogram of the feature against two other features of labels one and two
+
+    Parameters
+    ----------
+    feature : the name of the feature against which to plot
+
+    label1, label2 : the name of the two features plotted
+
+    array_label1 : the dataframe for label=label1
+    
+    array_label2 : the dataframe for label=label2
+
+    bins: number of bins in the histogram
+
+    """
+    x1 = np.array(array_label1[feature].dropna())
+    x2 = np.array(array_label2[feature].dropna())
+    plt.hist([x1, x2], label=[label1, label2], bins=bins, color=['r', 'b'])
+    plt.legend(loc="upper left")
+    plt.title('Relative distribution of %s' %feature)
+    plt.show()
+
+def fill_with_median(df, feature, classes):
+    """
+    Fills null values of feature with the median of the class
+
+    Parameters
+    ----------
+    df : the dataframe studied
+
+    feature : the name of the feature to fill
+
+    classes : the feature taken in consideration to fill
+              the value
+    """
+    for i in range(len(df)):
+        X = df[feature][i]
+
+        if pd.isnull(X):
+            current_class = df[classes][i]
+            df[feature][i] = df[df[classes]==current_class][feature].median()
+
+
+def div_cat(cat, num_div):
+    """
+    Returns a list of a divided category. The category must
+    have numerical values. Each division of the category has
+    approximatevely the same numbers of values.
+
+    Parameters
+    ----------
+    cat : the series to divide
+
+    num_div : the number of divisions
+
+    Returns
+    -------
+    A list of number with each number corresponding to the
+    category division the initial value belonged to.
+    """
+    divided_cat = []
+    divisions = pd.qcut(cat,num_div,retbins=True, duplicates="drop")[1]
+    for i in range(len(cat)):
+        for j in range(0, num_div):
+            if cat[i] <= divisions[j+1]:
+                divided_cat.append(j+1)
+                break
+    return divided_cat