Sunday, April 27, 2014

Simple Python Problems

a = [1, 2, 3, 0]
b = [0, 5, 4, 6, 7]

Sort the union of two lists:
sorted(set(a + b))                    =>  [0, 1, 2, 3, 4, 5, 6, 7]
sorted(set(a + b), reverse=True)      =>  [7, 6, 5, 4, 3, 2, 1, 0]
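
If NumPy is already in use, np.unique gives the same ascending result directly (a small sketch; reversing the array yields the descending order):

import numpy as np

np.unique(a + b)          # => array([0, 1, 2, 3, 4, 5, 6, 7])
np.unique(a + b)[::-1]    # => array([7, 6, 5, 4, 3, 2, 1, 0])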

Wednesday, January 1, 2014

Multi-stage Random Forest - Python SciKit

Steps:
1. Conduct PCA on the combined training and testing data (using both sets yields better components than using the training data alone, because the training set has only 1000 observations while the testing set has 9000);
2. Fit a random forest on the training set and score it with 10-fold cross-validation, giving an accuracy of 88.2%;
3. Add the predicted class-1 probability as a new feature to both the training and testing sets, refit the random forest, and score it with cross-validation, giving an accuracy of 95.6% (a condensed sketch of this feature-stacking step appears just before the full script below);
4. Repeat step 3 to obtain an accuracy of 96.8%.

Classification accuracy on the testing set: 94.78%

Data and Descriptions are here:
https://www.kaggle.com/c/data-science-london-scikit-learn
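
Before the full script, here is a minimal sketch of the feature-stacking idea behind steps 3-4. It assumes X_data, y_data and test_data have already been loaded and PCA-transformed as in the script below; add_stage and rf_params are illustrative names, not part of the original code:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

def add_stage(params, X, y, X_test):
    # fit a forest, then append its class-1 probability as an extra column
    clf = RandomForestClassifier(**params).fit(X, y)
    X_aug = np.column_stack([X, clf.predict_proba(X)[:, 1]])
    X_test_aug = np.column_stack([X_test, clf.predict_proba(X_test)[:, 1]])
    return clf, X_aug, X_test_aug

rf_params = dict(n_estimators=200, min_samples_split=10,
                 min_samples_leaf=10, random_state=0)
clf1, X1, T1 = add_stage(rf_params, X_data, y_data, test_data)  # stage 1
clf2, X2, T2 = add_stage(rf_params, X1, y_data, T1)             # stage 2 -> features for stage 3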

Python script:
 
import numpy as np
from sklearn import decomposition
from sklearn import cross_validation as cv
from sklearn.ensemble import RandomForestClassifier
import os
# change directory
os.chdir('C:\\Users\\n0233126\\Desktop\\Learning Materials\\Kaggle\\Data Science London')

X_data = np.genfromtxt(open('train.csv','rb'), delimiter=',')
y_data = np.genfromtxt(open('trainLabels.csv','rb'), delimiter=',')
test_data = np.genfromtxt(open('test.csv','rb'), delimiter=',')

# conduct PCA on the corpus of training & testing data
all_X_data = np.array(X_data.tolist()+test_data.tolist())
pca_try = decomposition.PCA()
pca_try.fit(all_X_data)
#plot(pca_try.explained_variance_ratio_) # 12 principal components

pca = decomposition.PCA(n_components=12, whiten=True)
all_X_data = pca.fit_transform(all_X_data)
X_data = pca.transform(X_data)
test_data = pca.transform(test_data)
 
# stage 1: fit on train_data and predict for test_data
clf = RandomForestClassifier(n_estimators=200, max_depth=None,
                             min_samples_split=10, min_samples_leaf=10,
                             random_state=0)
clf.fit(X_data,y_data)

# cross-validation score
scores = cv.cross_val_score(clf, X_data, y_data, cv=10)
print('Estimated score: %0.5f' % scores.mean())
 
# stage 2: RF - add the predicted probability from stage 1 as a new feature
X_data_pred_prob = clf.predict_proba(X_data)
X_data_pred_prob_class1 = [i[1] for i in X_data_pred_prob]
test_data_pred_prob = clf.predict_proba(test_data)
test_data_pred_prob_class1 = [i[1] for i in test_data_pred_prob]

X_data_new = X_data.tolist()
for i in range(len(X_data_new)):
    X_data_new[i].append(X_data_pred_prob_class1[i])
X_data_new = np.array(X_data_new)

test_data_new = test_data.tolist()
for i in range(len(test_data_new)):
    test_data_new[i].append(test_data_pred_prob_class1[i])
test_data_new = np.array(test_data_new)

clf_stage2 = RandomForestClassifier(n_estimators=200, max_depth=None,
                                    min_samples_split=10, min_samples_leaf=10,
                                    random_state=0)
clf_stage2.fit(X_data_new,y_data)

# cross-validation score
scores_stage2 = cv.cross_val_score(clf_stage2, X_data_new, y_data, cv=10)
print('Estimated score: %0.5f' % scores_stage2.mean())


# stage 3: RF - add the predicted probability from stage 2 as a new feature
X_data_pred_prob_stage2 = clf_stage2.predict_proba(X_data_new)
X_data_pred_prob_class1_stage2 = [i[1] for i in X_data_pred_prob_stage2]
test_data_pred_prob_stage2 = clf_stage2.predict_proba(test_data_new)
test_data_pred_prob_class1_stage2 = [i[1] for i in test_data_pred_prob_stage2]

X_data_new2 = X_data_new.tolist()
for i in range(len(X_data_new2)):
    X_data_new2[i].append(X_data_pred_prob_class1_stage2[i])
X_data_new2 = np.array(X_data_new2)

test_data_new2 = test_data_new.tolist()
for i in range(len(test_data_new2)):
    test_data_new2[i].append(test_data_pred_prob_class1_stage2[i])
test_data_new2 = np.array(test_data_new2)
 
clf_stage3 = RandomForestClassifier(n_estimators=200, max_depth=None,
                                    min_samples_split=10, min_samples_leaf=10,
                                    random_state=0)
clf_stage3.fit(X_data_new2,y_data)

# cross-validation score
scores_stage3 = cv.cross_val_score(clf_stage3, X_data_new2, y_data, cv=10)
print('Estimated score: %0.5f' % scores_stage3.mean())

Classification using Support Vector Machine (Python - SciKit)

Steps:
1. Conduct PCA on the combined training and testing data (using both sets yields better components than using the training data alone, because the training set has only 1000 observations while the testing set has 9000);
2. Fit an SVM with cross-validation on the training data and classify the testing data (with predicted probabilities); the parameters are chosen by grid search;
3. Combine the training data with the portion of the labeled testing data whose predicted probability is >= 0.9 or <= 0.1 (i.e., the test observations classified with high confidence) and re-train the model (this can be iterated several more times; a minimal sketch of this step appears below the data link).

Classification accuracy: 95.5%

Data and Descriptions are here:
https://www.kaggle.com/c/data-science-london-scikit-learn
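
A minimal sketch of the self-training step (step 3). It assumes clf is the fitted stage-1 classifier from the script below and that X_data, y_data and test_data are the PCA-transformed arrays; names like confident are illustrative:

import numpy as np

proba_class1 = clf.predict_proba(test_data)[:, 1]
confident = np.where((proba_class1 >= 0.9) | (proba_class1 <= 0.1))[0]

X_stage2 = np.vstack([X_data, test_data[confident]])
y_stage2 = np.concatenate([y_data, clf.predict(test_data)[confident]])
# re-fit (e.g., with a fresh grid search) on (X_stage2, y_stage2) and predict the test data again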


Python Script:
# use both train data and test data to get features
# stage 1: use training data to classify test data
# stage 2: use part of labeled test data and training data to re-train model

import numpy as np
from sklearn import svm
from sklearn import decomposition
from sklearn import cross_validation as cv
from sklearn import grid_search as gs
from sklearn import metrics
import os

# change directory
os.chdir('C:\\Users\\n0233126\\Desktop\\Kaggle_SciKit')
X_data = np.genfromtxt(open('train.csv','rb'), delimiter=',')
y_data = np.genfromtxt(open('trainLabels.csv','rb'), delimiter=',')
test_data = np.genfromtxt(open('test.csv','rb'), delimiter=',')

 
# conduct PCA on the corpus of training & testing data
all_X_data = np.array(X_data.tolist()+test_data.tolist())
pca_try = decomposition.PCA()
pca_try.fit(all_X_data)
#plot(pca_try.explained_variance_ratio_) # 12 principal components
pca = decomposition.PCA(n_components=12, whiten=True)
all_X_data = pca.fit_transform(all_X_data)
X_data = pca.transform(X_data)
test_data = pca.transform(test_data)


# stage 1: fit on train_data and predict for test_data
# grid search for optimal parameters
c_range = 10.0 ** np.arange(6,8,0.25)
gamma_range = 10.0 ** np.arange(-1,0,0.25)
params = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C': c_range}]
cv_nfolds = cv.StratifiedKFold(y_data, n_folds=3)
clf = gs.GridSearchCV(svm.SVC(probability=True), params, cv=cv_nfolds)
clf.fit(X_data,y_data)
# print("The best classifier is: ",clf.best_estimator_)

# cross-validation score
scores = cv.cross_val_score(clf.best_estimator_, X_data, y_data, cv=10)
print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std()/2))
 
# classify on test data
output_stage1 = clf.predict(test_data)

 
# stage 2: use test_data which has predicted_prob_class1 >=0.9 or <=0.1
test_data_pred_prob = clf.predict_proba(test_data)
test_data_pred_prob_class1 = [i[1] for i in test_data_pred_prob]
confident_test_data_index = [i for i in range(len(test_data_pred_prob_class1))
                             if test_data_pred_prob_class1[i] <= 0.1
                             or test_data_pred_prob_class1[i] >= 0.9]
 
confident_test_data = test_data[confident_test_data_index]
confident_test_y_data = output_stage1[confident_test_data_index]

X_data_stage2 = np.array(X_data.tolist()+confident_test_data.tolist())
y_data_stage2 = np.array(y_data.tolist()+confident_test_y_data.tolist())

# PCA remains the same
# (because PCA was fitted on the corpus of training & testing data)
c_range_stage2 = 10.0 ** np.arange(5,10,1)
gamma_range_stage2 = 10.0 ** np.arange(-2,0,0.5)
params_stage2 = [{'kernel': ['rbf'], 'gamma': gamma_range_stage2,
                  'C': c_range_stage2}]

cv_nfolds_stage2 = cv.StratifiedKFold(y_data_stage2, n_folds=3)
clf_stage2 = gs.GridSearchCV(svm.SVC(probability=True), params_stage2,
                             cv=cv_nfolds_stage2)
clf_stage2.fit(X_data_stage2,y_data_stage2)

 
scores_stage2 = cv.cross_val_score(clf_stage2.best_estimator_,
                                   X_data_stage2, y_data_stage2, cv=5)

print('Estimated score: %0.5f (+/- %0.5f)' % (scores_stage2.mean(),
                                              scores_stage2.std() / 2))

output_stage2 = clf_stage2.predict(test_data)
np.savetxt('output.csv', output_stage2, fmt='%d')


Saturday, November 24, 2012

[Python] Traffic Prediction for Advertisers

import simplejson as json
from math import sqrt
import numpy
import os
from pandas import DataFrame
from pandas.tools.merge import merge
import random
import re

USE_NUMPY = True
SAMPLE_PROBABILITY = 0.2    # sample 20% of the records for the simulation
SIMULATION_ITERATIONS = 20  # number of simulation iterations
MINIMUM_IMPRESSIONS = 500
ALPHA = 1
BETA = 10
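# ALPHA and BETA are the Beta-prior parameters used in part 4 when sampling
# CTR: numpy.random.beta(clicks + ALPHA, impressions + BETA)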

# specify the buyer_campaign
BUYER_CAMPAIGN_ID = 201

DATA_PATH = os.environ['DATA_PATH'] if 'DATA_PATH' in os.environ else '.'

# Prerequisite input files, needed by this program:

# Example:
# {"status": "active", "title": "Pork Rinds Nutrition Information",
#  "url": "http://www.livestrong.com/article/263272-pork-rinds-nutr...",
#  "site_id": 3, "campaign_id": 4,
#  "image_url": "http://livestrong.com/media/images/category-images/...jpg",
#  "id": 15949, "__key__": "Rad:00f70ed87c579203b4521627204912ffb32c29ef..."}
RAD_FILE = DATA_PATH + '/rad.txt'

# Example:
# {"status": "active", "account_id": 3,
#  "allowed_countries": ["KR", "SY", "IR", "CU", "SD"], "site_id": 3,
#  "date_end": null, "date_start": null, "internal_tracking": "",
#  "budget": null, "id": 3, "external_tracking": "",
#  "allowed_countries_is_whitelist": false, "link_attribution": "",
#  "bid": null, "__key__": "Campaign:3", "name": "LS Top 10,000"}
CAMPAIGN_FILE = DATA_PATH + '/campaign.txt'

# Huge file, example:
# {"free_radids_to_attributes": {
#    "196771": {"rad_impressions": 499, "rad_clicks": 0, "is_editorial": null},
#    "191140": {"rad_impressions": 38, "rad_clicks": 26, "is_editorial": null},
#    ...
#  "paid_radids_to_attributes": {
#    ... }
#  "url": "http://www.ehow.com/12345-some-silly-document.html"}
# TODO(ming|kevinx): document on how to retrieve rankings from Tracker
# For faster run:
# cat radrankings | head -1000 > sample_radrankings.txt
RADRANKINGS_FILE = DATA_PATH + '/sample_radrankings.txt'

# campaign_id, (seller) site_id,  content_unit, position, clicks
# 3       3       2       4       99
# 4       3       2       3       261
PIG_CLICK_COUNT_FILE = DATA_PATH + '/click_count.txt'

# campaign_id, (seller) site_id, content_unit, position, imps
# 3       3       2       5       57086
# 7       4       27      0       62846
PIG_IMP_COUNT_FILE = DATA_PATH + '/imp_count.txt'

# site_id, host_url, cu_imps
# 3       http://www.livestrong.com/article/100261-pepsin/        63
# 3       http://www.livestrong.com/article/24522-tonsils/        11
PIG_HOST_URL_IMPS_FILE = DATA_PATH + '/host_url_imps.txt'


missing_files = []
for file in (RAD_FILE, CAMPAIGN_FILE, RADRANKINGS_FILE,
             PIG_CLICK_COUNT_FILE, PIG_IMP_COUNT_FILE,
             PIG_HOST_URL_IMPS_FILE):
    if not os.path.exists(file):
        missing_files.append(file)
if missing_files:
    raise RuntimeError("Unable to find file(s):%s" % missing_files)

# TODO(kevinx): move these to testable subroutines


# PART 1:
# Return the list of buyer_rads_id for a given target seller site & campaign
buyer_rads_file = DATA_PATH + '/_buyer_rads.txt'
print '1) %s -> %s' % (RAD_FILE, buyer_rads_file)


def get_all_radids_from_campaignid(campaign_id=BUYER_CAMPAIGN_ID):
    """
    Find all the unique ids that have the campaign_id
    """
    rad_set = set()
    num_processed_lines_part1 = 0
    for content in open(RAD_FILE):
        num_processed_lines_part1 += 1
        if num_processed_lines_part1 % 500000 == 0:
            print('num_processed_lines_part1=%s' % num_processed_lines_part1)
        record = json.loads(content)
        if record['campaign_id'] == campaign_id:
            rad_set.add(str(record['id']))
    return rad_set   # to make a unique set of rads_id

rad_id_list = list(get_all_radids_from_campaignid())
buyer_rads_fw = open(buyer_rads_file, 'w')
buyer_rads_fw.write(json.dumps(rad_id_list))
buyer_rads_fw.close()


# Part 2: create a list of radranking records that the buyer_rads
# created in part 1 are eligible to show

## eventually all sellers need to be explored, but since each seller has
## different frame set-ups and CTRs, simulate for each seller separately
## and then aggregate
domain_to_remain = 'http://www.livestrong.com'  # specify the seller

radrankings_eligible_unique_file = (
    DATA_PATH + '/_radrankings_eligible_unique.txt')
print '2) %s -> %s' % (RADRANKINGS_FILE,
                       radrankings_eligible_unique_file)
num_processed_lines_part2 = 0
# json.loads converts the file contents back to a list (read() alone returns a string)
buyer_rad_ids = json.loads((open(buyer_rads_file)).read())

radrankings_eligible_fw = open(radrankings_eligible_unique_file, 'w')
# TODO(kevinx): allow bzip2 and gz files as input directly
url_seen = set()
duplicate_url_count = 0
for content in open(RADRANKINGS_FILE):
    num_processed_lines_part2 += 1
    if num_processed_lines_part2 % 100000 == 0:
        print 'num_processed_lines_part2={}'.format(num_processed_lines_part2)

    record = json.loads(content)
    url = record['url']
    if url in url_seen:
        duplicate_url_count += 1
        continue

    if (not url.startswith(domain_to_remain) or
            not record['paid_radids_to_attributes']):
        continue

    url_seen.add(url)
    radid_to_attributes = record['paid_radids_to_attributes']
    rad_id_pool = [str(rad_id) for rad_id in radid_to_attributes]

    if set(buyer_rad_ids).intersection(rad_id_pool):
        radrankings_eligible_fw.write(content)
radrankings_eligible_fw.close()
if duplicate_url_count:
    print("WARNING: Encountered %d duplicate URLs in %s" %
          (duplicate_url_count, RADRANKINGS_FILE))


# part 3: for each record generated in radrankings_eligible_unique,
# generate corresponding campaign_id and current bidding price
# for each eligible rad

## map campaign id to latest bid
campaignid_to_bid = {}
for content in open(CAMPAIGN_FILE):
    record = json.loads(content)
    campaign_id = record['id']  # this is an integer
    bid = record['bid']
    if record['status'] == 'active' and bid > 0:
        campaignid_to_bid[campaign_id] = bid
    else:
        campaignid_to_bid[campaign_id] = 0

## some campaign_ids may no longer exist but still appear in rad.txt
for campaign_id in range(max(campaignid_to_bid.keys()) + 100):
    if campaign_id not in campaignid_to_bid:
        campaignid_to_bid[campaign_id] = 0

# BUYER_CAMPAIGN_ID (201) was already defined in part 1
campaignid_to_bid[BUYER_CAMPAIGN_ID] = 0.05   # the campaign's new bid

radrankings_with_bid_file = DATA_PATH + '/_radrankings_with_bid.txt'
print('3) %s -> %s' %
      (radrankings_eligible_unique_file, radrankings_with_bid_file))

rad_info = {}  # map rad_id to (...)
for content in open(RAD_FILE):
    record = json.loads(content)
    rad_id = record["id"]
    site_id = record["site_id"]
    campaign_id = record["campaign_id"]
    rad_info[rad_id] = (site_id, campaign_id)

# now generate a file that contains the url and the paid rad ids with
# tuples of [impressions, clicks, buyer_site_id, buyer_campaign_id, bid]
radrankings_with_bid_fw = open(radrankings_with_bid_file, 'w')
num_processed_lines_part3 = 0
for content in open(radrankings_eligible_unique_file):
    num_processed_lines_part3 += 1
    if num_processed_lines_part3 % 10000 == 0:
        print 'num_processed_lines_part3={}'.format(num_processed_lines_part3)

    record = json.loads(content)
    radranking_pool = {
        'paid_radid_to_attributes': {},
        'url': str(record['url'])
    }

    # generate the rads that are eligible to show for each host_url.
    # each rad includes the info:imp,click,buyer_site_id,buyer_campaign_id,bid
    radid_to_attributes = record["paid_radids_to_attributes"]
    for rad_id, attributes in radid_to_attributes.items():
        buyer_site_id = rad_info[int(rad_id)][0]
        buyer_campaign_id = rad_info[int(rad_id)][1]
        bid = campaignid_to_bid[buyer_campaign_id]
        radranking_pool['paid_radid_to_attributes'][rad_id] = (
            # force impressions at least to be 500 so rads without enough imps
            # won't dominate high ranks in the simulation
            max(attributes['rad_impressions'], MINIMUM_IMPRESSIONS),
            attributes['rad_clicks'],
            # buyer_site_id and buyer_campaign_id may be used in the future to
            # generate a reasonable CTR per buyer_site or per buyer_campaign
            # when buyer_rads haven't received enough imps
            buyer_site_id,
            buyer_campaign_id,
            bid
        )
    radrankings_with_bid_fw.write(json.dumps(radranking_pool) + "\n")
radrankings_with_bid_fw.close()

# part 4: simulation: for each seller url, estimate the probability
# the target campaign ranks the first/second place

rad_position_probability_file = DATA_PATH + '/_rad_probability.txt'
print('4) Simulation (%s)...' % rad_position_probability_file)

rads_set = set(json.loads(open(buyer_rads_file).read()))

rad_prod_fw = open(rad_position_probability_file, 'w')
num_processed_lines_part4 = 0
for content in open(radrankings_with_bid_file):
    if random.uniform(0, 1) > SAMPLE_PROBABILITY:
        continue

    num_processed_lines_part4 += 1
    if num_processed_lines_part4 % 10000 == 0:
        print 'num_processed_lines_part4={}'.format(num_processed_lines_part4)

    record = json.loads(content)
    url = record['url']
    radid_to_attributes = record['paid_radid_to_attributes']

    rad_rank_by_ecpm = []
    rad_position0 = []
    rad_position1 = []
    if USE_NUMPY:
        ecpm = dict((iter, {}) for iter in range(SIMULATION_ITERATIONS))
        for rad_id, attributes in radid_to_attributes.items():
            impressions = attributes[0]
            clicks = attributes[1]
            bid = attributes[4]
            # TODO(kevinx|ming): use matrices operations from numpy for speed
            for iter, beta in enumerate(numpy.random.beta(
                    clicks + ALPHA, impressions + BETA,
                    size=SIMULATION_ITERATIONS)):
                ecpm[iter][str(rad_id)] = bid * beta

        for iter in range(SIMULATION_ITERATIONS):
            # rank rad by ecpm with descending order
            _ecpm = ecpm[iter]
            rad_rank_by_ecpm.append(sorted(_ecpm, key=_ecpm.get,
                                           reverse=True))
            rad_position0.append(rad_rank_by_ecpm[iter][0])
            rad_position1.append(rad_rank_by_ecpm[iter][1])
    else:
        ecpm = {}
        for iter in range(SIMULATION_ITERATIONS):
            ecpm[iter] = {}
            for rad_id, attributes in radid_to_attributes.items():
                impressions = attributes[0]
                clicks = attributes[1]
                bid = attributes[4]
                ecpm[iter][str(rad_id)] = random.betavariate(
                    clicks + ALPHA, impressions + BETA) * bid
            # rank rad by ecpm with descending order
            rad_rank_by_ecpm.append(sorted(ecpm[iter], key=ecpm[iter].get,
                                           reverse=True))
            # append the rad ids
            rad_position0.append(rad_rank_by_ecpm[iter][0])
            rad_position1.append(rad_rank_by_ecpm[iter][1])

    prob = {'url': url}
    counts_first = 0.0
    for rad in rad_position0:
        if rad in rads_set:
            counts_first += 1
    prob['probability_position0'] = round(counts_first /
                                          SIMULATION_ITERATIONS, 4)

    counts_second = 0.0
    for rad in rad_position1:
        if rad in rads_set:
            counts_second += 1
    prob['probability_position1'] = round(counts_second /
                                          SIMULATION_ITERATIONS, 4)

    rad_prod_fw.write(json.dumps(prob) + "\n")

rad_prod_fw.close()

# part 5: predictions for impressions of the buyer campaign on seller urls
# These come from pig script:

click_count_fd = open(PIG_CLICK_COUNT_FILE)
# campaign_id, (seller) site_id, content_unit, position, clicks
click_count_info_list = []
for content in click_count_fd:
    click_count_info_list.append([int(x) for x in content.split('\t')])

imp_count_fd = open(PIG_IMP_COUNT_FILE)
# campaign_id, (seller) site_id, content_unit, position, imps
imp_count_info_list = []
for content in imp_count_fd:
    imp_count_info_list.append([int(x) for x in content.split('\t')])

# campaign_id, (seller) site_id, content_unit, position, CTR
_campaign_site_content_position_ctr = []
for _click in click_count_info_list:
    for _imp in imp_count_info_list:
        if (_click[0] == _imp[0] and _click[1] == _imp[1]
                and _click[2] == _imp[2] and _click[3] == _imp[3]):
            campaign_site_content_position_ctr = [
                _imp[0], _imp[1], _imp[2], _imp[3],
                float(_click[4]) / _imp[4]]
            _campaign_site_content_position_ctr.append(
                campaign_site_content_position_ctr)
if not _campaign_site_content_position_ctr:
    raise RuntimeError(
        "The _campaign_site_content_position_ctr list is empty. "
        "Please make sure %s and %s have intersections." % (
            PIG_CLICK_COUNT_FILE, PIG_IMP_COUNT_FILE))
campaign_site_content_position_ctr = DataFrame(
    _campaign_site_content_position_ctr,
    columns=['campaign_id', 'site_id', 'content_unit', 'position', 'ctr'])

# this is just for simulation: targeted buyer_campaign on one seller_site
# site_id=3: livestrong; site_id=4: ehow; site_id=25: cracked
content_position_ctr = campaign_site_content_position_ctr[
    (campaign_site_content_position_ctr.campaign_id == BUYER_CAMPAIGN_ID)
    & (campaign_site_content_position_ctr.site_id == 3)]

host_url_imp_fd = open(PIG_HOST_URL_IMPS_FILE)
# site_id, host_url, cu_imps
_host_url_imp_info_list = []
for content in host_url_imp_fd:
    content = content.rstrip()
    _host_url_info_tuple = [x for x in content.split('\t')]
    if len(_host_url_info_tuple) != 3:
        print("Warning: %s (line is not properly tab delimited:'%s')" % (
            PIG_HOST_URL_IMPS_FILE, content))
        continue
    site_id, host_url, cu_imps = _host_url_info_tuple
    _host_url_imp_info_list.append((int(site_id), host_url, int(cu_imps)))
host_url_cu_imps = DataFrame(
    _host_url_imp_info_list, columns=['site_id', 'url', 'cu_imps'])

_rad_position_probability_list = []  # url, prob_position0, prob_position1
for content in open(rad_position_probability_file):
    record = json.loads(content)
    row = [
        str(record['url']),
        record['probability_position0'],
        record['probability_position1']
    ]
    _rad_position_probability_list.append(row)
rad_position_probability_list = DataFrame(
    _rad_position_probability_list,
    columns=['url', 'prob_position0', 'prob_position1'])

# map url with positions
url_position = merge(
    host_url_cu_imps,
    rad_position_probability_list,
    on='url',
    how='inner')

num_processed_lines_part5 = 0
pred_imps = pred_clicks = 0  # TODO(ming): need to predict for clicks
var_imps = var_clicks = 0
# url_position contains: site_id, url, cu_imps, prob_position0, prob_position1
for idx in url_position.index:
    cu_imps = url_position['cu_imps'][idx]
    prob_position0 = url_position['prob_position0'][idx]
    prob_position1 = url_position['prob_position1'][idx]

    num_processed_lines_part5 += 1
    if num_processed_lines_part5 % 1000 == 0:
        print 'num_processed_lines_part5 = {}'.format(
            num_processed_lines_part5)
    pred_imps += cu_imps * (prob_position0 + prob_position1)
    # use a normal approximation for the variance: each impression lands the
    # campaign in position 0 or 1 as a Bernoulli(p) event with variance p*(1-p)
    var_imps += cu_imps * (
        prob_position0 * (1 - prob_position0) +
        prob_position1 * (1 - prob_position1))
pred = {}
pred['pred_imps'] = int(pred_imps / SAMPLE_PROBABILITY)
# 1.96 is the standard-normal z-value for a 95% confidence level,
# assuming the predicted imps are approximately normally distributed
pred['pred_imps_upper'] = int((pred_imps + 1.96 * sqrt(var_imps)) /
                               SAMPLE_PROBABILITY)
pred['pred_imps_lower'] = int((pred_imps - 1.96 * sqrt(var_imps)) /
                               SAMPLE_PROBABILITY)

prediction_fw = open(DATA_PATH + '/_prediction.txt', 'w')
prediction_fw.write(json.dumps(pred) + "\n")
prediction_fw.close()
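
For reference, the core of part 4 is a Monte-Carlo ranking: each rad's CTR is sampled from a Beta posterior and the rads are ranked by sampled eCPM = bid * CTR. A stripped-down sketch of that step, reusing ALPHA, BETA and SIMULATION_ITERATIONS from the script above; the rads dictionary here is purely hypothetical:

import numpy

# hypothetical rads: rad_id -> (impressions, clicks, bid)
rads = {'101': (500, 5, 0.05), '102': (800, 2, 0.10)}

first_place_counts = dict((rad_id, 0) for rad_id in rads)
for _ in range(SIMULATION_ITERATIONS):
    sampled_ecpm = {}
    for rad_id, (imps, clicks, bid) in rads.items():
        ctr = numpy.random.beta(clicks + ALPHA, imps + BETA)  # posterior draw of CTR
        sampled_ecpm[rad_id] = bid * ctr
    winner = max(sampled_ecpm, key=sampled_ecpm.get)  # rad in position 0 this round
    first_place_counts[winner] += 1
# first_place_counts[rad_id] / SIMULATION_ITERATIONS estimates P(position 0) per rad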

Wednesday, October 24, 2012

[Python] Generating Histograms

import matplotlib.pyplot as plt
from numpy.random import normal

a = normal(10, 2, size=1000)
plt.hist(a, bins=10)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()



With the option normed=True, it returns the normalized histogram, i.e., a probability density plot.
plt.hist(a, bins=10, normed=True)
plt.show()
 

With the option cumulative=True, it returns the cumulative distribution plot.
plt.hist(a, bins=10, normed=True, cumulative=True)
plt.show()