1. Conduct PCA. Fitting the PCA on both the training and testing data together yields better results than fitting it on the training data alone, since the training set has only 1,000 observations while the testing set has 9,000;
2. Fit a random forest on the training set and score it with 10-fold cross-validation, which gives an accuracy of 88.2%;
3. Add the predicted probability of class 1 as a new feature to both the training and testing sets, then fit another random forest on the augmented training set and score it with cross-validation, which gives an accuracy of 95.6%;
4. Repeat step 3 once more, which gives an accuracy of 96.8%. A compact sketch of this repeated stacking step is given below, just before the full script.
Classification accuracy on the testing set: 94.78%
Data and descriptions are available here:
https://www.kaggle.com/c/data-science-london-scikit-learn
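Before the full script, here is a compact sketch of the repeated stacking step described in steps 2-4. The variable names X_stacked and T_stacked are illustrative only, not from the original script, and this assumes the PCA-transformed X_data, y_data and test_data defined in the full script below:

# Sketch of steps 2-4: fit a random forest, score it with CV, then append the
# in-sample predicted probability of class 1 as an extra feature and refit.
X_stacked, T_stacked = X_data, test_data  # PCA-transformed training / testing features
for stage in range(3):
    clf_sketch = RandomForestClassifier(n_estimators=200, min_samples_split=10,
                                        min_samples_leaf=10, random_state=0)
    clf_sketch.fit(X_stacked, y_data)
    print(cv.cross_val_score(clf_sketch, X_stacked, y_data, cv=10).mean())
    X_stacked = np.column_stack([X_stacked, clf_sketch.predict_proba(X_stacked)[:, 1]])
    T_stacked = np.column_stack([T_stacked, clf_sketch.predict_proba(T_stacked)[:, 1]])

The full script spells out each stage explicitly.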
Python script:
import numpy as np
from sklearn import decomposition
from sklearn import cross_validation as cv  # sklearn.model_selection in newer scikit-learn versions
from sklearn.ensemble import RandomForestClassifier
import os
# change directory
os.chdir('C:\\Users\\n0233126\\Desktop\\Learning Materials\\Kaggle\\Data Science London')
X_data = np.genfromtxt(open('train.csv','rb'), delimiter=',')
y_data = np.genfromtxt(open('trainLabels.csv','rb'), delimiter=',')
test_data = np.genfromtxt(open('test.csv','rb'), delimiter=',')
# conduct PCA on the corpus of training & testing data
all_X_data = np.array(X_data.tolist()+test_data.tolist())
pca_try = decomposition.PCA()
pca_try.fit(all_X_data)
#plot(pca_try.explained_variance_ratio_) # 12 principal components
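# (optional check, not in the original run: the 12-component choice can also be
#  read off the cumulative explained-variance ratio instead of the plot)
#print(np.cumsum(pca_try.explained_variance_ratio_))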
pca = decomposition.PCA(n_components=12, whiten=True)
all_X_data = pca.fit_transform(all_X_data)
X_data = pca.transform(X_data)
test_data = pca.transform(test_data)
# stage 1: fit on train_data and predict for test_data
clf=RandomForestClassifier(n_estimators=200, max_depth=None,
min_samples_split=10, min_samples_leaf=10,
random_state=0)
clf.fit(X_data,y_data)
# cross-validation score
scores = cv.cross_val_score(clf, X_data, y_data, cv=10)
print('Estimated score: %0.5f' % scores.mean())
# stage 2: RF - add the predicted probability from stage 1 as a new feature
X_data_pred_prob = clf.predict_proba(X_data)
X_data_pred_prob_class1 = [i[1] for i in X_data_pred_prob]
test_data_pred_prob = clf.predict_proba(test_data)
test_data_pred_prob_class1 = [i[1] for i in test_data_pred_prob]
X_data_new = X_data.tolist()
for i in range(len(X_data_new)):
    X_data_new[i].append(X_data_pred_prob_class1[i])
X_data_new = np.array(X_data_new)
test_data_new = test_data.tolist()
for i in range(len(test_data_new)):
    test_data_new[i].append(test_data_pred_prob_class1[i])
test_data_new = np.array(test_data_new)
clf_stage2=RandomForestClassifier(n_estimators=200, max_depth=None,
min_samples_split=10, min_samples_leaf=10,
random_state=0)
clf_stage2.fit(X_data_new,y_data)
# cross-validation score
scores_stage2 = cv.cross_val_score(clf_stage2, X_data_new, y_data, cv=10)
print('Estimated score: %0.5f' % scores_stage2.mean())
# stage 3: RF - add the predicted probability from stage 2 as a new feature
X_data_pred_prob_stage2 = clf_stage2.predict_proba(X_data_new)
X_data_pred_prob_class1_stage2 = [i[1] for i in X_data_pred_prob_stage2]
test_data_pred_prob_stage2 = clf_stage2.predict_proba(test_data_new)
test_data_pred_prob_class1_stage2 = [i[1] for i in test_data_pred_prob_stage2]
X_data_new2 = X_data_new.tolist()
for i in range(len(X_data_new2)):
    X_data_new2[i].append(X_data_pred_prob_class1_stage2[i])
X_data_new2 = np.array(X_data_new2)
test_data_new2 = test_data_new.tolist()
for i in range(len(test_data_new2)):
    test_data_new2[i].append(test_data_pred_prob_class1_stage2[i])
test_data_new2 = np.array(test_data_new2)
clf_stage3=RandomForestClassifier(n_estimators=200, max_depth=None,
min_samples_split=10, min_samples_leaf=10,
random_state=0)
clf_stage3.fit(X_data_new2,y_data)
# cross-validation score
scores_stage3 = cv.cross_val_score(clf_stage3, X_data_new2, y_data, cv=10)
print('Estimated score: %0.5f' % scores_stage3.mean())
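The script above stops at the stage-3 cross-validation score. A minimal sketch of how the testing-set predictions could then be written out for submission follows; the file name and the Id,Solution column layout are assumptions about the Kaggle submission format, not part of the original script:

# write out predictions for the 9,000 testing observations (assumed submission format)
test_pred = clf_stage3.predict(test_data_new2).astype(int)
ids = np.arange(1, len(test_pred) + 1)
np.savetxt('submission.csv', np.column_stack([ids, test_pred]),
           fmt='%d', delimiter=',', header='Id,Solution', comments='')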