Adding the modelisations

37c1c035 · William RUFFINE · Kubat · 628fe5b0 · 37c1c035
--- a/_titanic/modelisation.py
+++ b/_titanic/modelisation.py
+import numpy as np
+from sklearn.model_selection import cross_val_score, train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import RFE
+from sklearn.model_selection import GridSearchCV
+
+
+
+def logmodel_prediction(X, y, test_size, random_state):
+    """
+    Fit a logistic regression on a train/test split of X and y, print the
+    classification report and the score, and return the score.
+
+    Parameters
+    ----------
+    X : the dataframe to use in the logistic regression
+
+    y : the target to reach in the logistic regression
+
+    test_size : the size of the test dataframe (forwarded to train_test_split)
+
+    random_state : controls the shuffling applied to the data before applying the split.
+
+    Returns
+    -------
+    score : the result of LogisticRegression.score on the held-out test split
+    """
+    X_train, X_test, y_train, y_test = train_test_split(
+        X,
+        y,
+        test_size=test_size,
+        random_state=random_state,
+    )
+    logmodel = LogisticRegression()
+    logmodel.fit(X_train, y_train)
+    predictions = logmodel.predict(X_test)
+    print(classification_report(y_test, predictions))
+    # Compute the score once so the printed and returned values always agree.
+    score = logmodel.score(X_test, y_test)
+    print("score :", score)
+    # The docstring always promised a return value; callers that ignored the
+    # previous implicit None are unaffected.
+    return score
+
+def random_forest_prediction(X, y, test_size, random_state1, random_state2):
+    """
+    Fit a random forest classifier on a train/test split of X and y, print the
+    classification report and the score, and return the score.
+
+    Parameters
+    ----------
+    X : the dataframe to use in the random forest prediction
+
+    y : the target to reach in the random forest prediction
+
+    test_size : the size of the test dataframe (forwarded to train_test_split)
+
+    random_state1 : controls the shuffling applied to the data before applying the split.
+
+    random_state2 : the random_state given to the RandomForestClassifier, so that
+                    the fitted forest is reproducible
+
+    Returns
+    -------
+    score : the result of RandomForestClassifier.score on the held-out test split
+    """
+    X_train, X_test, y_train, y_test = train_test_split(
+        X,
+        y,
+        test_size=test_size,
+        random_state=random_state1,
+    )
+    est = RandomForestClassifier(random_state=random_state2)
+    est.fit(X_train, y_train)
+    predictions = est.predict(X_test)
+    print(classification_report(y_test, predictions))
+    # Compute the score once so the printed and returned values always agree.
+    score = est.score(X_test, y_test)
+    print("score: ", score)
+    # The docstring always promised a return value; callers that ignored the
+    # previous implicit None are unaffected.
+    return score
+
+
+def RFE_predicion(X, y, test_size, random_state, n_feature_to_select, verbose=1):
+    """
+    Fit a random forest wrapped in recursive feature elimination (RFE) on a
+    train/test split of X and y, print the classification report and the
+    score, and return the score.
+
+    Parameters
+    ----------
+    X : the dataframe to use in the random forest prediction
+
+    y : the target to reach in the random forest prediction
+
+    test_size : the size of the test dataframe (forwarded to train_test_split)
+
+    random_state : controls the shuffling applied to the data before applying the split.
+                   The random forest itself always uses random_state=42.
+
+    n_feature_to_select : the number of feature to select
+                          (forwarded to RFE as n_features_to_select)
+
+    verbose : the verbosity level forwarded to RFE (default 1)
+
+    Returns
+    -------
+    score : the result of RFE.score on the held-out test split
+    """
+    X_train, X_test, y_train, y_test = train_test_split(
+        X,
+        y,
+        test_size=test_size,
+        random_state=random_state,
+    )
+    # Fixed seed: only the train/test split is controlled by the caller.
+    estimator = RandomForestClassifier(random_state=42)
+    feat_select = RFE(
+        estimator=estimator,
+        n_features_to_select=n_feature_to_select,
+        verbose=verbose,
+    )
+    feat_select.fit(X_train, y_train)
+    predictions = feat_select.predict(X_test)
+    print(classification_report(y_test, predictions))
+    # Compute the score once so the printed and returned values always agree.
+    score = feat_select.score(X_test, y_test)
+    print("score: ", score)
+    # The docstring always promised a return value; callers that ignored the
+    # previous implicit None are unaffected.
+    return score
+
+
+# Correctly spelled alias; the original (misspelled) name is kept so that
+# existing callers continue to work.
+RFE_prediction = RFE_predicion
+
+
+
+def GSCV_prediction(X, y, test_size, random_state, param_grid=None):
+    """
+    Tune a random forest with a 5-fold GridSearchCV on the training part of a
+    train/test split of X and y, print the chosen parameters, the
+    classification report and the score, and return the score.
+
+    Parameters
+    ----------
+    X : the dataframe to use in the random forest prediction
+
+    y : the target to reach in the random forest prediction
+
+    test_size : the size of the test dataframe (forwarded to train_test_split)
+
+    random_state : controls the shuffling applied to the data before applying the split.
+                   The random forest itself always uses random_state=42.
+
+    param_grid : optional grid of parameters to search, in the format expected
+                 by GridSearchCV. When None (the default), the built-in grid
+                 over n_estimators, max_depth, min_samples_split and
+                 min_samples_leaf is used.
+
+    Returns
+    -------
+    score : the result of GridSearchCV.score on the held-out test split
+    """
+    X_train, X_test, y_train, y_test = train_test_split(
+        X,
+        y,
+        test_size=test_size,
+        random_state=random_state,
+    )
+    # Fixed seed: only the train/test split is controlled by the caller.
+    estimator = RandomForestClassifier(random_state=42)
+    if param_grid is None:
+        # Default grid: 6 * 5 * 4 * 3 = 360 combinations, each fitted on 5
+        # folds, so the default search is expensive.
+        param_grid = {
+            'n_estimators': [50, 100, 150, 200, 250, 300],
+            'max_depth': np.arange(6, 16, 2),
+            'min_samples_split': np.arange(10, 30, 5),
+            'min_samples_leaf': np.arange(5, 20, 5),
+        }
+    gd_sr2 = GridSearchCV(
+        estimator=estimator,
+        param_grid=param_grid,
+        cv=5,
+        n_jobs=-1,  # run the candidate fits in parallel
+    )
+    gd_sr2.fit(X_train, y_train)
+    predictions = gd_sr2.predict(X_test)
+    print("Parameters chosen: ", gd_sr2.best_params_)
+    print(classification_report(y_test, predictions))
+    # Compute the score once so the printed and returned values always agree.
+    score = gd_sr2.score(X_test, y_test)
+    print("score: ", score)
+    # The docstring always promised a return value; callers that ignored the
+    # previous implicit None are unaffected.
+    return score
\ No newline at end of file