Wednesday, January 1, 2014

Multi-stage Random Forest - Python SciKit

Steps:
1. Conduct PCA. Fitting the PCA on both the training and testing data yields better results than fitting it on the training data alone, because the training set has only 1,000 observations while the testing set has 9,000.
2. Fit a random forest on the training set and score it with 10-fold cross-validation, giving an accuracy of 88.2%.
3. Add the predicted class-1 probability as a new feature to both the training and testing sets, then fit another random forest on the augmented training set and score it with cross-validation, giving an accuracy of 95.6%. (A minimal sketch of this stacking step follows the data link below.)
4. Repeat step 3 once more, giving an accuracy of 96.8%.

Classification Accuracy on the testing set: 94.78%

Data and Descriptions are here:
https://www.kaggle.com/c/data-science-london-scikit-learn
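
The key idea in steps 3-4 is feature stacking: the class-1 probability predicted by one forest becomes an extra input column for the next forest. A minimal sketch of that step, assuming clf is an already-fitted classifier and X_data holds the 12 PCA components (np.column_stack is used here in place of the tolist/append loops in the full script below):

prob_class1 = clf.predict_proba(X_data)[:, 1]        # predicted probability of class 1
X_data_new = np.column_stack((X_data, prob_class1))  # original features plus the stacked probability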

Python script:
 
import numpy as np
from sklearn import svm
from sklearn import decomposition
from sklearn import cross_validation as cv
from sklearn import grid_search as gs
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import os
# change directory
os.chdir('C:\\Users\\n0233126\\Desktop\\Learning Materials\\Kaggle\\Data Science London')

X_data = np.genfromtxt(open('train.csv','rb'), delimiter=',')
y_data = np.genfromtxt(open('trainLabels.csv','rb'), delimiter=',')
test_data = np.genfromtxt(open('test.csv','rb'), delimiter=',')

# conduct PCA on the corpus of training & testing data
all_X_data = np.array(X_data.tolist()+test_data.tolist())
pca_try = decomposition.PCA()
pca_try.fit(all_X_data)
#plot(pca_try.explained_variance_ratio_) # 12 principal components

pca = decomposition.PCA(n_components=12, whiten=True)
all_X_data = pca.fit_transform(all_X_data)
X_data = pca.transform(X_data)
test_data = pca.transform(test_data)
 
# stage 1: fit on train_data and predict for test_data
clf = RandomForestClassifier(n_estimators=200, max_depth=None,
                             min_samples_split=10, min_samples_leaf=10,
                             random_state=0)
clf.fit(X_data,y_data)

# cross-validation score
scores = cv.cross_val_score(clf, X_data, y_data, cv=10)
print('Estimated score: %0.5f' % scores.mean())
 
# stage 2: RF - add the predicted probability from stage 1 as a new feature
X_data_pred_prob = clf.predict_proba(X_data)
X_data_pred_prob_class1 = [i[1] for i in X_data_pred_prob]
test_data_pred_prob = clf.predict_proba(test_data)
test_data_pred_prob_class1 = [i[1] for i in test_data_pred_prob]

X_data_new = X_data.tolist()
for i in range(len(X_data_new)):
    X_data_new[i].append(X_data_pred_prob_class1[i])
X_data_new = np.array(X_data_new)

test_data_new = test_data.tolist()
for i in range(len(test_data_new)):
    test_data_new[i].append(test_data_pred_prob_class1[i])
test_data_new = np.array(test_data_new)

clf_stage2 = RandomForestClassifier(n_estimators=200, max_depth=None,
                                    min_samples_split=10, min_samples_leaf=10,
                                    random_state=0)
clf_stage2.fit(X_data_new,y_data)

# cross-validation score
scores_stage2 = cv.cross_val_score(clf_stage2, X_data_new, y_data, cv=10)
print('Estimated score: %0.5f' % scores_stage2.mean())


# stage 3: RF - add the predicted probability from stage 2 as a new feature
X_data_pred_prob_stage2 = clf_stage2.predict_proba(X_data_new)
X_data_pred_prob_class1_stage2 = [i[1] for i in X_data_pred_prob_stage2]
test_data_pred_prob_stage2 = clf_stage2.predict_proba(test_data_new)
test_data_pred_prob_class1_stage2 = [i[1] for i in test_data_pred_prob_stage2]

X_data_new2 = X_data_new.tolist()
for i in range(len(X_data_new2)):
    X_data_new2[i].append(X_data_pred_prob_class1_stage2[i])
X_data_new2 = np.array(X_data_new2)

test_data_new2 = test_data_new.tolist()
for i in range(len(test_data_new2)):
    test_data_new2[i].append(test_data_pred_prob_class1_stage2[i])
test_data_new2 = np.array(test_data_new2)
 
clf_stage3 = RandomForestClassifier(n_estimators=200, max_depth=None,
                                    min_samples_split=10, min_samples_leaf=10,
                                    random_state=0)
clf_stage3.fit(X_data_new2,y_data)

# cross-validation score
scores_stage3 = cv.cross_val_score(clf_stage3, X_data_new2, y_data, cv=10)
print('Estimated score: %0.5f' % scores_stage3.mean())
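
To produce the test-set predictions reported above, the stage-3 forest can be applied to the augmented test set and the labels written out, mirroring the np.savetxt call in the SVM script further below (the output file name here is only illustrative):

output_rf = clf_stage3.predict(test_data_new2)    # labels for the 9000 test observations
np.savetxt('rf_output.csv', output_rf, fmt='%d')  # 'rf_output.csv' is a placeholder name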






Classification using Support Vector Machine (Python - SciKit)

Steps:
1. Conduct PCA. Fitting the PCA on both the training and testing data yields better results than fitting it on the training data alone, because the training set has only 1,000 observations while the testing set has 9,000.
2. Fit an SVM (with predicted probabilities enabled) on the training data and classify the testing data; the kernel parameters are tuned by grid search with cross-validation.
3. Combine the training data with the portion of the labeled testing data whose predicted class-1 probabilities are >= 0.9 or <= 0.1 (i.e., the test observations classified with high confidence) and re-train the model. (A compact sketch of this selection step follows the data link below; the step can be iterated several more times, as sketched after the full script.)

Classification Accuracy: 95.5%

Data and Descriptions are here:
https://www.kaggle.com/c/data-science-london-scikit-learn
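
The heart of step 3 is the confidence filter. A compact, vectorized sketch of that selection, equivalent to the index-list approach in the script below (clf, X_data, y_data and test_data are the names used there):

# keep only the test rows the stage-1 SVM classifies with high confidence
prob_class1 = clf.predict_proba(test_data)[:, 1]
confident = (prob_class1 <= 0.1) | (prob_class1 >= 0.9)
X_aug = np.vstack((X_data, test_data[confident]))                    # augmented features
y_aug = np.concatenate((y_data, clf.predict(test_data)[confident]))  # stage-1 labels as pseudo-labels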


Python Script:
# use both train data and test data to get features
# stage 1: use training data to classify test data
# stage 2: use part of labeled test data and training data to re-train model

import numpy as np
from sklearn import svm
from sklearn import decomposition
from sklearn import cross_validation as cv
from sklearn import grid_search as gs
from sklearn import metrics
import os

# change directory
os.chdir('C:\\Users\\n0233126\\Desktop\\Kaggle_SciKit')
X_data = np.genfromtxt(open('train.csv','rb'), delimiter=',')
y_data = np.genfromtxt(open('trainLabels.csv','rb'), delimiter=',')
test_data = np.genfromtxt(open('test.csv','rb'), delimiter=',')

 
# conduct PCA on the corpus of training & testing data
all_X_data = np.array(X_data.tolist()+test_data.tolist())
pca_try = decomposition.PCA()
pca_try.fit(all_X_data)
#plot(pca_try.explained_variance_ratio_) # 12 principal components
pca = decomposition.PCA(n_components=12, whiten=True)
all_X_data = pca.fit_transform(all_X_data)
X_data = pca.transform(X_data)
test_data = pca.transform(test_data)


# stage 1: fit on train_data and predict for test_data
# grid search for optimal parameters
c_range = 10.0 ** np.arange(6,8,0.25)
gamma_range = 10.0 ** np.arange(-1,0,0.25)
params = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C': c_range}]
cv_nfolds = cv.StratifiedKFold(y_data, n_folds=3)
clf = gs.GridSearchCV(svm.SVC(probability=True), params, cv=cv_nfolds)
clf.fit(X_data,y_data)
# print("The best classifier is: ",clf.best_estimator_)

# cross-validation score
scores = cv.cross_val_score(clf.best_estimator_, X_data, y_data, cv=10)
print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std()/2))
 
# classify on test data
output_stage1 = clf.predict(test_data)

 
# stage 2: use test_data which has predicted_prob_class1 >=0.9 or <=0.1
test_data_pred_prob = clf.predict_proba(test_data)
test_data_pred_prob_class1 = [i[1] for i in test_data_pred_prob]
confident_test_data_index = [i for i in range(len(test_data_pred_prob_class1))
                             if test_data_pred_prob_class1[i] <= 0.1
                             or test_data_pred_prob_class1[i] >= 0.9]
 
confident_test_data = test_data[confident_test_data_index]
confident_test_y_data = output_stage1[confident_test_data_index]

X_data_stage2 = np.array(X_data.tolist()+confident_test_data.tolist())
y_data_stage2 = np.array(y_data.tolist()+confident_test_y_data.tolist())

# PCA remains the same
# because PCA was based on the corpus of training & testing data)
c_range_stage2 = 10.0 ** np.arange(5,10,1)
gamma_range_stage2 = 10.0 ** np.arange(-2,0,0.5)
params_stage2 = [{'kernel': ['rbf'], 'gamma': gamma_range_stage2,
                  'C': c_range_stage2}]

cv_nfolds_stage2 = cv.StratifiedKFold(y_data_stage2, n_folds=3)
clf_stage2 = gs.GridSearchCV(svm.SVC(probability=True), params_stage2,
                             cv=cv_nfolds_stage2)
clf_stage2.fit(X_data_stage2,y_data_stage2)

 
scores_stage2 = cv.cross_val_score(clf_stage2.best_estimator_,
                                   X_data_stage2, y_data_stage2, cv=5)

print('Estimated score: %0.5f (+/- %0.5f)' % (scores_stage2.mean(),
                                              scores_stage2.std() / 2))

output_stage2 = clf_stage2.predict(test_data)
np.savetxt('output.csv', output_stage2, fmt='%d')
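
As noted in step 3, the pseudo-labeling can be iterated. A minimal sketch of such a loop, reusing the stage-2 best estimator; the 0.1/0.9 thresholds follow step 3, while the number of extra passes is an arbitrary choice:

best = clf_stage2.best_estimator_
for _ in range(3):                                                  # number of extra passes is arbitrary
    prob = best.predict_proba(test_data)[:, 1]
    keep = (prob <= 0.1) | (prob >= 0.9)                            # confident test rows only
    X_aug = np.vstack((X_data, test_data[keep]))
    y_aug = np.concatenate((y_data, best.predict(test_data)[keep]))
    best.fit(X_aug, y_aug)                                          # re-train on training + pseudo-labeled data
output_final = best.predict(test_data)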