Statistics/Data Mining Applications: 2012

import simplejson as json
from math import *
import numpy
import os
from pandas import DataFrame
from pandas.tools.merge import merge
import random
import re

USE_NUMPY = True
SAMPLE_PROBABILITY = 0.2   # sampling for 20% of the records to simulate
SIMULATION_ITERATIONS = 20 # simulation times
MINIMUM_IMPRESSIONS = 500
ALPHA = 1
BETA = 10

# specify the buyer_campaign
BUYER_CAMPAIGN_ID = 201

DATA_PATH = os.environ['DATA_PATH'] if 'DATA_PATH' in os.environ else '.'

# Prerequisite input files, needed by this program:

# Example:
# {"status": "active", "title": "Pork Rinds Nutrition Information",
# "url": "http://www.livestrong.com/article/263272-pork-rinds-nutr...",
# "site_id": 3, "campaign_id": 4,
# "image_url": "http://livestrong.com/media/images/category-images/...jpg",
# "id": 15949, "__key__": "Rad:00f70ed87c579203b4521627204912ffb32c29ef..."}
RAD_FILE = DATA_PATH + '/rad.txt'

# Example:
# {"status": "active", "account_id": 3,
# "allowed_countries": ["KR", "SY", "IR", "CU", "SD"], "site_id": 3,
# "date_end": null, "date_start": null, "internal_tracking": "",
# "budget": null, "id": 3, "external_tracking": "",
# "allowed_countries_is_whitelist": false, "link_attribution": "",
# "bid": null, "__key__": "Campaign:3", "name": "LS Top 10,000"}
CAMPAIGN_FILE = DATA_PATH + '/campaign.txt'

# Huge file, example:
# {"free_radids_to_attributes": {
#    "196771": {"rad_impressions": 499, "rad_clicks": 0, "is_editorial": null},
#    "191140": {"rad_impressions": 38, "rad_clicks": 26, "is_editorial": null},
#    ...
# "paid_radids_to_attributes": {
#    ... }
# "url": "http://www.ehow.com/12345-some-silly-document.html"}
# TODO(ming|kevinx): document on how to retrieve rankings from Tracker
# For faster run:
# cat radrankings | head -1000 > sample_radrankings.txt
RADRANKINGS_FILE = DATA_PATH + '/sample_radrankings.txt'

# campaign_id, (seller) site_id, content_unit, position, clicks
# 3       3       2       4       99
# 4       3       2       3       261
PIG_CLICK_COUNT_FILE = DATA_PATH + '/click_count.txt'

# campaign_id, (seller) site_id, content_unit, position, imps
# 3       3       2       5       57086
# 7       4       27      0       62846
PIG_IMP_COUNT_FILE = DATA_PATH + '/imp_count.txt'

# site_id, host_url, cu_imps
# 3       http://www.livestrong.com/article/100261-pepsin/        63
# 3       http://www.livestrong.com/article/24522-tonsils/        11
PIG_HOST_URL_IMPS_FILE = DATA_PATH + '/host_url_imps.txt'

missing_files = []
for file in (RAD_FILE, CAMPAIGN_FILE, RADRANKINGS_FILE,
             PIG_CLICK_COUNT_FILE, PIG_IMP_COUNT_FILE,
             PIG_HOST_URL_IMPS_FILE):
    if not os.path.exists(file):
        missing_files.append(file)
if missing_files:
    raise RuntimeError("Unable to find file(s):%s" % missing_files)

# TODO(kevinx): move these to testable subroutines

# PART 1:
# Return the list of buyer_rads_id for a given target seller site & campaign
buyer_rads_file = DATA_PATH + '/_buyer_rads.txt'
print '1) %s -> %s' % (RAD_FILE, buyer_rads_file)

def get_all_radids_from_campaignid(campaign_id=BUYER_CAMPAIGN_ID):
    """
    Find all the unique ids that have the campaign_id
    """
    rad_set = set()
    num_processed_lines_part1 = 0
    for content in open(RAD_FILE):
        num_processed_lines_part1 += 1
        if num_processed_lines_part1 % 500000 == 0:
            print('num_processed_lines_part1=%s' % num_processed_lines_part1)
        record = json.loads(content)
        if record['campaign_id'] == campaign_id:
            rad_set.add(str(record['id']))
    return rad_set   # to make a unique set of rads_id

rad_id_list = list(get_all_radids_from_campaignid())
buyer_rads_fw = open(buyer_rads_file, 'w')
buyer_rads_fw.write(json.dumps(rad_id_list))
buyer_rads_fw.close()

# Part 2: create a list of radranking records that the buyer_rads
# created in part 1 are eligible to show

## need to explore all sellers in the end, but since each seller has different
## frame set-ups and ctr, need to simulate based on each seller
## and then aggregate
domain_to_remain = 'http://www.livestrong.com' # specify the seller

radrankings_eligible_unique_file = (
    DATA_PATH + '/_radrankings_eligible_unique.txt')
print '2) %s -> %s' % (RADRANKINGS_FILE,
                       radrankings_eligible_unique_file)
num_processed_lines_part2 = 0
# convert it to list (otherwise it would be a string)
buyer_rad_ids = json.loads((open(buyer_rads_file)).read())

radrankings_eligible_fw = open(radrankings_eligible_unique_file, 'w')
# TODO(kevinx): allow bzip2 and gz files as input directly
url_seen = set()
duplicate_url_count = 0
for content in open(RADRANKINGS_FILE):
    num_processed_lines_part2 += 1
    if num_processed_lines_part2 % 100000 == 0:
        print 'num_processed_lines_part2={}'.format(num_processed_lines_part2)

    record = json.loads(content)
    url = record['url']
    if url in url_seen:
        duplicate_url_count += 1
        continue

    if (not url.startswith(domain_to_remain) or
            not record['paid_radids_to_attributes']):
        continue

    url_seen.add(url)
    radid_to_attributes = record['paid_radids_to_attributes']
    rad_id_pool = []
    rad_impclick = []
    for rad_id in radid_to_attributes.keys():
        rad_id_pool.append(str(rad_id))

    if len(set(buyer_rad_ids).intersection(set(rad_id_pool))) > 0:
        radrankings_eligible_fw.write(content)
radrankings_eligible_fw.close()
if duplicate_url_count:
    print("WARNING: Encountered %d duplicate URLs in %s" %
          (duplicate_url_count, RADRANKINGS_FILE))

# part 3: for each record generated in radrankings_eligible_unique,
# generate corresponding campaign_id and current bidding price
# for each eligible rad

## map campaign id to latest bid
campaignid_to_bid = {}
for content in open(CAMPAIGN_FILE):
    record = json.loads(content)
    campaign_id = record['id'] # this is an integer
    bid = record['bid']
    if record['status'] == 'active' and bid > 0:
        campaignid_to_bid[campaign_id] = bid
    else:
        campaignid_to_bid[campaign_id] = 0

## some campaign_id may not exist now but exists in the file rad.txt
for iter in range(max(campaignid_to_bid.keys()) + 100):
    if iter not in campaignid_to_bid.keys():
        campaignid_to_bid[iter] = 0

# defined in part1
BUYER_CAMPAIGN_ID = 201
campaignid_to_bid[BUYER_CAMPAIGN_ID] = 0.05   # campaign new bid

radrankings_with_bid_file = DATA_PATH + '/_radrankings_with_bid.txt'
print('3) %s -> %s' %
      (radrankings_eligible_unique_file, radrankings_with_bid_file))

rad_info = {} # map rad_id to (...)
for content in open(RAD_FILE):
    record = json.loads(content)
    rad_id = record["id"]
    site_id = record["site_id"]
    campaign_id = record["campaign_id"]
    rad_info[rad_id] = (site_id, campaign_id)

# now generate a file that contains url and paid rad ids with
# with tuples [impressions, clicks, buyer_site_id, buyer_campaign_id, bid]
radrankings_with_bid_fw = open(radrankings_with_bid_file, 'w')
num_processed_lines_part3 = 0
for content in open(radrankings_eligible_unique_file):
    num_processed_lines_part3 += 1
    if num_processed_lines_part3 % 10000 == 0:
        print 'num_processed_lines_part3={}'.format(num_processed_lines_part3)

    record = json.loads(content)
    radranking_pool = {
        'paid_radid_to_attributes': {},
        'url': str(record['url'])
    }
    #radranking_pool["url"] = str(url)
    #radranking_pool["paid"] = {}

    # generate the rads that are eligible to show for each host_url.
    # each rad includes the info:imp,click,buyer_site_id,buyer_campaign_id,bid
    radid_to_attributes = record["paid_radids_to_attributes"]
    for rad_id, attributes in radid_to_attributes.items():
        buyer_site_id = rad_info[int(rad_id)][0]
        buyer_campaign_id = rad_info[int(rad_id)][1]
        bid = campaignid_to_bid[buyer_campaign_id]
        radranking_pool['paid_radid_to_attributes'][rad_id] = (
            # force impressions at least to be 500 so rads without enough imps
            # won't dominate high ranks in the simulation
            max(attributes['rad_impressions'], MINIMUM_IMPRESSIONS),
            attributes['rad_clicks'],
# buyer_site_id and buyer_campaign_id may be used in the future when generating
# reasonable CTR for each buyer_site or for each buyer_camapign in the case
# when buyer_rads haven't received enough imps
            buyer_site_id,
            buyer_campaign_id,
            bid
        )
    radrankings_with_bid_fw.write(json.dumps(radranking_pool) + "\n")
radrankings_with_bid_fw.close()

# part 4: simulation: for each seller url, estimate the probability
# the target campaign ranks the first/second place

rad_position_probability_file = DATA_PATH + '/_rad_probability.txt'
print('4) Simulation (%s)...' % rad_position_probability_file)

rads_set = set(json.loads(open(buyer_rads_file).read()))

rad_prod_fw = open(rad_position_probability_file, 'w')
num_processed_lines_part4 = 0
for content in open(radrankings_with_bid_file):
    if random.uniform(0, 1) > SAMPLE_PROBABILITY:
        continue

    num_processed_lines_part4 += 1
    if num_processed_lines_part4 % 10000 == 0:
        print 'num_processed_lines_part4={}'.format(num_processed_lines_part4)

    record = json.loads(content)
    url = record['url']
    radid_to_attributes = record['paid_radid_to_attributes']

    rad_rank_by_ecpm = []
    rad_position0 = []
    rad_position1 = []
    if USE_NUMPY:
        ecpm = dict((iter, {}) for iter in range(SIMULATION_ITERATIONS))
        for rad_id, attributes in radid_to_attributes.items():
            impressions = attributes[0]
            clicks = attributes[1]
            bid = attributes[4]
            # TODO(kevinx|ming): use matrices operations from numpy for speed
            for iter, beta in enumerate(numpy.random.beta(
                clicks + ALPHA, impressions + BETA,
                    size=SIMULATION_ITERATIONS)):
                ecpm[iter][str(rad_id)] = bid * beta

        for iter in range(SIMULATION_ITERATIONS):
            # rank rad by ecpm with descending order
            _ecpm = ecpm[iter]
            rad_rank_by_ecpm.append(sorted(_ecpm, key=_ecpm.get,
                                           reverse=True))
            rad_position0.append(rad_rank_by_ecpm[iter][0])
            rad_position1.append(rad_rank_by_ecpm[iter][1])
    else:
        ecpm = {}
        for iter in range(SIMULATION_ITERATIONS):
            ecpm[iter] = {}
            for rad_id, attributes in radid_to_attributes.items():
                impressions = attributes[0]
                clicks = attributes[1]
                bid = attributes[4]
                ecpm[iter][str(rad_id)] = random.betavariate(
                    clicks + ALPHA, impressions + BETA) * bid
            # rank rad by ecpm with descending order
            rad_rank_by_ecpm.append(sorted(ecpm[iter], key=ecpm[iter].get,
                                           reverse=True))
            # append the rad ids
            rad_position0.append(rad_rank_by_ecpm[iter][0])
            rad_position1.append(rad_rank_by_ecpm[iter][1])

    prob = {'url': url}
    counts_first = 0.0
    for rad in rad_position0:
        if rad in rads_set:
            counts_first += 1
    prob['probability_position0'] = round(counts_first /
                                          SIMULATION_ITERATIONS, 4)

    counts_second = 0.0
    for rad in rad_position1:
        if rad in rads_set:
            counts_second += 1
    prob['probability_position1'] = round(counts_second /
                                          SIMULATION_ITERATIONS, 4)

    rad_prod_fw.write(json.dumps(prob) + "\n")

radrankings_with_bid_fw.close()
rad_prod_fw.close()

# part 5: predictions for impressions of the buyer campaign on seller urls
# These come from pig script:

click_count_fd = open(PIG_CLICK_COUNT_FILE)
# campaign_id, (seller) site_id, content_unit, position, clicks
click_count_info_list = []
for content in click_count_fd:
    click_count_info_list.append([int(x) for x in content.split('\t')])

imp_count_fd = open(PIG_IMP_COUNT_FILE)
# campaign_id, (seller) site_id, content_unit, position, imps
imp_count_info_list = []
for content in imp_count_fd:
    imp_count_info_list.append([int(x) for x in content.split('\t')])

# campaign_id, (seller) site_id, content_unit, position, CTR
_campaign_site_content_position_ctr = []
for _click in click_count_info_list:
    for _imp in imp_count_info_list:
        if (_click[0] == _imp[0] and _click[1] == _imp[1]
        and _click[2] == _imp[2] and _click[3] == _imp[3]):
            campaign_site_content_position_ctr = [
            _imp[0], _imp[1], _imp[2], _imp[3], float(_click[4]) / _imp[4]]
            _campaign_site_content_position_ctr.append(
               campaign_site_content_position_ctr)
if not _campaign_site_content_position_ctr:
    raise RuntimeError(
        "The _campaign_site_content_position_ctr list is empty. "
        "Please make sure %s and %s have intersections." % (
            PIG_CLICK_COUNT_FILE, PIG_IMP_COUNT_FILE))
campaign_site_content_position_ctr = DataFrame(
    _campaign_site_content_position_ctr,
    columns=['campaign_id', 'site_id', 'content_unit', 'position', 'ctr'])

# this is just for simulation: targeted buyer_campaign on one seller_site
# site_id=3: livestrong; site_id=4: ehow; site_id=25: cracked
content_position_ctr = campaign_site_content_position_ctr[
    (campaign_site_content_position_ctr.campaign_id == BUYER_CAMPAIGN_ID)
    & (campaign_site_content_position_ctr.site_id == 3)]

host_url_imp_fd = open(PIG_HOST_URL_IMPS_FILE)
# site_id, host_url, cu_imps
_host_url_imp_info_list = []
for content in host_url_imp_fd:
    content = content.rstrip()
    _host_url_info_tuple = [x for x in content.split('\t')]
    if len(_host_url_info_tuple) != 3:
        print("Warning: %s (line is not properly tab delimited:'%s')" % (
            PIG_HOST_URL_IMPS_FILE, content))
        continue
    site_id, host_url, cu_imps = _host_url_info_tuple
    _host_url_imp_info_list.append((int(site_id), host_url, int(cu_imps)))
host_url_cu_imps = DataFrame(
    _host_url_imp_info_list, columns=['site_id', 'url', 'cu_imps'])

_rad_position_probability_list = [] # url, prob_position0, prob_position1
for content in open(rad_position_probability_file):
    record = json.loads(content)
    list = [
        str(record['url']),
        record['probability_position0'],
        record['probability_position1']
    ]
    _rad_position_probability_list.append(list)
rad_position_probability_list = DataFrame(
    _rad_position_probability_list,
    columns=['url', 'prob_position0', 'prob_position1'])

# map url with positions
url_position = merge(
    host_url_cu_imps,
    rad_position_probability_list,
    on='url',
    how='inner')

num_processed_lines_part5 = 0
pred_imps = pred_clicks = 0 # TODO(ming): need to predict for clicks
var_imps = var_clicks = 0
# url_position contains: site_id, url, cu_imps, prob_position0, prob_position1
for idx in url_position.index:
    cu_imps = url_position['cu_imps'][idx]
    prob_position0 = url_position['prob_position0'][idx]
    prob_position1 = url_position['prob_position1'][idx]

    num_processed_lines_part5 += 1
    if num_processed_lines_part5 % 1000 == 0:
        print 'num_processed_lines_part5 = {}'.format(
        num_processed_lines_part5)
    pred_imps += cu_imps * (prob_position0 + prob_position1)
    # use normal approximation for estimating the variance
    var_imps += cu_imps * (
    prob_position0 * (1 - prob_position0) +
    prob_position1 * (1 - prob_position1))
pred = {}
pred['pred_imps'] = int(pred_imps / SAMPLE_PROBABILITY)
# 1.96 corresponds to z(standard normal)-value of 95% confidence level
# assuming the predicted imps is normally distributed
pred['pred_imps_upper'] = int((pred_imps + 1.96 * sqrt(var_imps)) /
                               SAMPLE_PROBABILITY)
pred['pred_imps_lower'] = int((pred_imps - 1.96 * sqrt(var_imps)) /
                               SAMPLE_PROBABILITY)

prediction_fw = open(DATA_PATH + '/_prediction.txt', 'w')
prediction_fw.write(json.dumps(pred) + "\n")
prediction_fw.close()

In pattern recognition, the k-nearest neighbor algorithm (k-NN) is a method for classifying objects based on closest training examples in the feature space.

The training examples are vectors in a multidimensional feature space, each with a class label. The training phase of the algorithm consists only of storing the feature vectors and class labels of the training samples.

In the classification phase, k is a user-defined constant, and an unlabeled vector (a query or test point) is classified by assigning the label which is most frequent among the k training samples nearest to that query point.

Usually Euclidean distance is used as the distance metric; however this is only applicable to continuous variables. In cases such as text classification, another metric such as the overlap metric (or Hamming distance) can be used. Often, the classification accuracy of "k"-NN can be improved significantly if the distance metric is learned with specialized algorithms such as Large Margin Nearest Neighbor or Neighbourhood components analysis.

The best choice of k depends upon the data; generally, larger values of k reduce the effect of noise on the classification, but make boundaries between classes less distinct. A good k can be selected by various heuristic techniques, for example, cross-validation (for example, choose the value of k by minimizing mis-classification rate). The special case where the class is predicted to be the class of the closest training sample (i.e. when k = 1) is called the nearest neighbor algorithm.

A drawback to the basic "majority voting" classification is that the classes with the more frequent examples tend to dominate the prediction of the new vector, as they tend to come up in the k nearest neighbors when the neighbors are computed due to their large number. One way to overcome this problem is to weigh the classification taking into account the distance from the test point to each of its k nearest neighbors.

Python code:
import numpy as np
import matplotlib.pyplot as plt
import mlpy
np.random.seed(0)
mean1, cov1, n1 = [1, 5], [[1,1],[1,2]], 200 # 200 samples of class 1
x1 = np.random.multivariate_normal(mean1, cov1, n1)
y1 = np.ones(n1, dtype=np.int)
mean2, cov2, n2 = [2.5, 2.5], [[1,0],[0,1]], 300 # 300 samples of class 2
x2 = np.random.multivariate_normal(mean2, cov2, n2)
y2 = 2 * np.ones(n2, dtype=np.int)
mean3, cov3, n3 = [5, 8], [[0.5,0],[0,0.5]], 200 # 200 samples of class 3
x3 = np.random.multivariate_normal(mean3, cov3, n3)
y3 = 3 * np.ones(n3, dtype=np.int)
x = np.concatenate((x1, x2, x3), axis=0) # concatenate the samples
y = np.concatenate((y1, y2, y3))
knn = mlpy.KNN(k=3)
knn.learn(x, y)
knn.nclasses() # return the number of classes
xmin, xmax = x[:,0].min()-1, x[:,0].max()+1
ymin, ymax = x[:,1].min()-1, x[:,1].max()+1
xx, yy = np.meshgrid(np.arange(xmin, xmax, 0.1), np.arange(ymin, ymax, 0.1))
xnew = np.c_[xx.ravel(), yy.ravel()]    # testing data
ynew = knn.pred(xnew).reshape(xx.shape)   # Predict KNN model on a test point
ynew[ynew == 0] = 1 # set the samples with no unique classification to 1
fig = plt.figure(1)
cmap = plt.set_cmap(plt.cm.Paired)
plot1 = plt.pcolormesh(xx, yy, ynew)   # plot the separated regions with different color
plot2 = plt.scatter(x[:,0], x[:,1], c=y) # plot the scatter plot of the data
plt.show()

Statistics/Data Mining Applications

Saturday, November 24, 2012

[Python] Traffic Prediction for advertisers

Wednesday, October 24, 2012

[Python] Generating Histograms

[Pyhon] Data Analysis toolkit 'pandas'

Tuesday, October 2, 2012

[Python] K-nearest Neighbor classification

Wednesday, September 26, 2012

［Python］Elastic Net Classifier