"""Model fitting and evaluation helpers (module ``_titanic/modelisation.py``).

Every public function in this module follows the same steps: split ``X`` and
``y`` into train and test parts with ``train_test_split``, fit a scikit-learn
classifier on the training part, print a classification report and the test
score, and return that score.
"""

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score  # noqa: F401 - unused here, kept from the original import list
from sklearn.model_selection import train_test_split


def _evaluate(model, X_test, y_test, score_label):
    """Print the report and test score of an already fitted model.

    Parameters
    ----------
    model : fitted object exposing ``predict`` and ``score``

    X_test : the features held out for testing

    y_test : the targets held out for testing

    score_label : the text printed in front of the score

    Returns
    -------
    The value of ``model.score(X_test, y_test)``.
    """
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))
    score = model.score(X_test, y_test)
    print(score_label, score)
    return score


def logmodel_prediction(X, y, test_size, random_state):
    """
    Fit a logistic regression on a train split of X and y, print the
    classification report and the test score, and return the score.

    Parameters
    ----------
    X : the dataframe to use in the logistic regression

    y : the target to reach in the logistic regression

    test_size : the size of the test split, forwarded to train_test_split

    random_state : controls the shuffling applied to the data before applying the split.

    Returns
    -------
    The score of the fitted model on the test split, as given by its
    ``score`` method.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    logmodel = LogisticRegression()
    logmodel.fit(X_train, y_train)
    # This function has always printed "score :" (space before the colon),
    # unlike its siblings; the label is kept so the output does not change.
    return _evaluate(logmodel, X_test, y_test, "score :")


def random_forest_prediction(X, y, test_size, random_state1, random_state2):
    """
    Fit a random forest on a train split of X and y, print the
    classification report and the test score, and return the score.

    Parameters
    ----------
    X : the dataframe to use in the random forest prediction

    y : the target to reach in the random forest prediction

    test_size : the size of the test split, forwarded to train_test_split

    random_state1 : controls the shuffling applied to the data before applying the split.

    random_state2 : the random_state given to the random forest classifier

    Returns
    -------
    The score of the fitted model on the test split, as given by its
    ``score`` method.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state1
    )
    forest = RandomForestClassifier(random_state=random_state2)
    forest.fit(X_train, y_train)
    return _evaluate(forest, X_test, y_test, "score: ")


def RFE_predicion(X, y, test_size, random_state, n_feature_to_select, verbose=1,
                  estimator_random_state=42):
    """
    Fit a random forest wrapped in recursive feature elimination (RFE) on a
    train split of X and y, print the classification report and the test
    score, and return the score.

    Parameters
    ----------
    X : the dataframe to use in the random forest prediction

    y : the target to reach in the random forest prediction

    test_size : the size of the test split, forwarded to train_test_split

    random_state : controls the shuffling applied to the data before applying the split.

    n_feature_to_select : the number of features to select, forwarded to RFE
        as n_features_to_select

    verbose : the verbosity forwarded to RFE (default 1)

    estimator_random_state : the random_state given to the random forest
        classifier (default 42, the value that used to be hard-coded)

    Returns
    -------
    The score of the fitted RFE selector on the test split, as given by its
    ``score`` method.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    forest = RandomForestClassifier(random_state=estimator_random_state)
    selector = RFE(
        estimator=forest,
        n_features_to_select=n_feature_to_select,
        verbose=verbose,
    )
    selector.fit(X_train, y_train)
    return _evaluate(selector, X_test, y_test, "score: ")


def GSCV_prediction(X, y, test_size, random_state, estimator_random_state=42):
    """
    Tune a random forest with a 5-fold grid search on a train split of X and
    y, print the chosen parameters, the classification report and the test
    score, and return the score.

    Parameters
    ----------
    X : the dataframe to use in the random forest prediction

    y : the target to reach in the random forest prediction

    test_size : the size of the test split, forwarded to train_test_split

    random_state : controls the shuffling applied to the data before applying the split.

    estimator_random_state : the random_state given to the random forest
        classifier (default 42, the value that used to be hard-coded)

    Returns
    -------
    The score of the fitted grid search on the test split, as given by its
    ``score`` method.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    forest = RandomForestClassifier(random_state=estimator_random_state)
    # Every combination of these values is tried by the grid search.
    parameters = {
        'n_estimators': [50, 100, 150, 200, 250, 300],
        'max_depth': np.arange(6, 16, 2),
        'min_samples_split': np.arange(10, 30, 5),
        'min_samples_leaf': np.arange(5, 20, 5),
    }
    search = GridSearchCV(
        estimator=forest,
        param_grid=parameters,
        cv=5,
        n_jobs=-1,  # use all available processors
    )
    search.fit(X_train, y_train)
    print("Parameters chosen: ", search.best_params_)
    return _evaluate(search, X_test, y_test, "score: ")