from __future__ import print_function
from math import sqrt
import os
import random
import numpy
import simplejson as json
from pandas import DataFrame, merge
USE_NUMPY = True
SAMPLE_PROBABILITY = 0.2  # sample 20% of the records for the simulation
SIMULATION_ITERATIONS = 20  # number of simulation iterations per url
MINIMUM_IMPRESSIONS = 500  # impression floor; see part 3
ALPHA = 1  # Beta prior shape parameters for sampling CTRs
BETA = 10
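# Note: the Beta(ALPHA, BETA) prior has mean ALPHA / (ALPHA + BETA) = 1/11,
# i.e. a baseline CTR assumption of roughly 9% before any data is observed.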
# specify the buyer_campaign
BUYER_CAMPAIGN_ID = 201
DATA_PATH = os.environ.get('DATA_PATH', '.')
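# A minimal usage sketch (the script filename below is illustrative):
#   DATA_PATH=/path/to/inputs python predict_buyer_imps.py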
# Prerequisite input files for this program:
# Example:
# {"status": "active", "title": "Pork Rinds Nutrition Information",
# "url": "http://www.livestrong.com/article/263272-pork-rinds-nutr...",
# "site_id": 3, "campaign_id": 4,
# "image_url": "http://livestrong.com/media/images/category-images/...jpg",
# "id": 15949, "__key__": "Rad:00f70ed87c579203b4521627204912ffb32c29ef..."}
RAD_FILE = DATA_PATH + '/rad.txt'
# Example:
# {"status": "active", "account_id": 3,
# "allowed_countries": ["KR", "SY", "IR", "CU", "SD"], "site_id": 3,
# "date_end": null, "date_start": null, "internal_tracking": "",
# "budget": null, "id": 3, "external_tracking": "",
# "allowed_countries_is_whitelist": false, "link_attribution": "",
# "bid": null, "__key__": "Campaign:3", "name": "LS Top 10,000"}
CAMPAIGN_FILE = DATA_PATH + '/campaign.txt'
# Huge file, example:
# {"free_radids_to_attributes": {
# "196771": {"rad_impressions": 499, "rad_clicks": 0, "is_editorial": null},
# "191140": {"rad_impressions": 38, "rad_clicks": 26, "is_editorial": null},
# ...
# "paid_radids_to_attributes": {
# ... }
# "url": "http://www.ehow.com/12345-some-silly-document.html"}
# TODO(ming|kevinx): document how to retrieve rankings from Tracker
# For a faster run:
#   cat radrankings | head -1000 > sample_radrankings.txt
RADRANKINGS_FILE = DATA_PATH + '/sample_radrankings.txt'
# campaign_id, (seller) site_id, content_unit, position, clicks
# 3 3 2 4 99
# 4 3 2 3 261
PIG_CLICK_COUNT_FILE = DATA_PATH + '/click_count.txt'
# campaign_id, (seller) site_id, content_unit, position, imps
# 3 3 2 5 57086
# 7 4 27 0 62846
PIG_IMP_COUNT_FILE = DATA_PATH + '/imp_count.txt'
# site_id, host_url, cu_imps
# 3 http://www.livestrong.com/article/100261-pepsin/ 63
# 3 http://www.livestrong.com/article/24522-tonsils/ 11
PIG_HOST_URL_IMPS_FILE = DATA_PATH + '/host_url_imps.txt'
missing_files = []
for path in (RAD_FILE, CAMPAIGN_FILE, RADRANKINGS_FILE,
             PIG_CLICK_COUNT_FILE, PIG_IMP_COUNT_FILE,
             PIG_HOST_URL_IMPS_FILE):
    if not os.path.exists(path):
        missing_files.append(path)
if missing_files:
    raise RuntimeError("Unable to find file(s): %s" % missing_files)
# TODO(kevinx): move these to testable subroutines
# PART 1:
# Write out the list of buyer rad ids for the target buyer campaign
buyer_rads_file = DATA_PATH + '/_buyer_rads.txt'
print('1) %s -> %s' % (RAD_FILE, buyer_rads_file))
def get_all_radids_from_campaignid(campaign_id=BUYER_CAMPAIGN_ID):
    """
    Return the set of unique rad ids that belong to campaign_id.
    """
    rad_set = set()
    num_processed_lines_part1 = 0
    for content in open(RAD_FILE):
        num_processed_lines_part1 += 1
        if num_processed_lines_part1 % 500000 == 0:
            print('num_processed_lines_part1=%s' % num_processed_lines_part1)
        record = json.loads(content)
        if record['campaign_id'] == campaign_id:
            rad_set.add(str(record['id']))
    return rad_set  # a set guarantees each rad id appears only once
rad_id_list = list(get_all_radids_from_campaignid())
buyer_rads_fw = open(buyer_rads_file, 'w')
buyer_rads_fw.write(json.dumps(rad_id_list))
buyer_rads_fw.close()
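# _buyer_rads.txt now holds a JSON list of rad id strings, for example
# (ids here are illustrative):
#   ["15949", "16012", "17003"]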
# PART 2: keep only the radranking records on which the buyer rads
# collected in part 1 are eligible to show
## eventually all sellers must be explored, but since each seller has
## different frame set-ups and CTRs, we simulate per seller and then
## aggregate
domain_to_remain = 'http://www.livestrong.com'  # the seller to simulate
radrankings_eligible_unique_file = (
DATA_PATH + '/_radrankings_eligible_unique.txt')
print('2) %s -> %s' % (RADRANKINGS_FILE,
                       radrankings_eligible_unique_file))
num_processed_lines_part2 = 0
# parse the JSON list written in part 1 (the raw file contents are a string)
buyer_rad_ids = json.loads(open(buyer_rads_file).read())
radrankings_eligible_fw = open(radrankings_eligible_unique_file, 'w')
# TODO(kevinx): allow bzip2 and gz files as input directly
url_seen = set()
duplicate_url_count = 0
buyer_rad_id_set = set(buyer_rad_ids)
for content in open(RADRANKINGS_FILE):
    num_processed_lines_part2 += 1
    if num_processed_lines_part2 % 100000 == 0:
        print('num_processed_lines_part2={}'.format(num_processed_lines_part2))
    record = json.loads(content)
    url = record['url']
    if url in url_seen:
        duplicate_url_count += 1
        continue
    if (not url.startswith(domain_to_remain) or
            not record['paid_radids_to_attributes']):
        continue
    url_seen.add(url)
    # JSON object keys are already strings, so intersect them directly
    # with the buyer rad id set (the per-record list copy was redundant)
    if buyer_rad_id_set.intersection(record['paid_radids_to_attributes']):
        radrankings_eligible_fw.write(content)
radrankings_eligible_fw.close()
if duplicate_url_count:
print("WARNING: Encountered %d duplicate URLs in %s" %
(duplicate_url_count, RADRANKINGS_FILE))
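# Illustration with the sample ids above (hypothetical buyer set): if
# buyer_rad_id_set == {"196771"}, a record whose paid rad ids are
# {"196771", "191140"} intersects the set and is kept, while a record
# with only {"191140"} is skipped.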
# PART 3: for each record in radrankings_eligible_unique, attach the
# campaign_id and current bid of every eligible rad
## map campaign id to its latest bid
campaignid_to_bid = {}
for content in open(CAMPAIGN_FILE):
    record = json.loads(content)
    campaign_id = record['id']  # this is an integer
    bid = record['bid']
    # bid may be null; treat inactive or unbid campaigns as bidding 0
    if record['status'] == 'active' and bid is not None and bid > 0:
        campaignid_to_bid[campaign_id] = bid
    else:
        campaignid_to_bid[campaign_id] = 0
## some campaign ids referenced by rad.txt no longer exist in
## campaign.txt; default their bids to 0 (+100 leaves headroom)
for missing_id in range(max(campaignid_to_bid) + 100):
    campaignid_to_bid.setdefault(missing_id, 0)
# BUYER_CAMPAIGN_ID (201) is defined at the top of the script
campaignid_to_bid[BUYER_CAMPAIGN_ID] = 0.05  # the new bid to simulate
radrankings_with_bid_file = DATA_PATH + '/_radrankings_with_bid.txt'
print('3) %s -> %s' %
(radrankings_eligible_unique_file, radrankings_with_bid_file))
rad_info = {}  # map rad_id to (site_id, campaign_id)
for content in open(RAD_FILE):
record = json.loads(content)
rad_id = record["id"]
site_id = record["site_id"]
campaign_id = record["campaign_id"]
rad_info[rad_id] = (site_id, campaign_id)
# now generate a file that pairs each url's paid rad ids with
# tuples of (impressions, clicks, buyer_site_id, buyer_campaign_id, bid)
radrankings_with_bid_fw = open(radrankings_with_bid_file, 'w')
num_processed_lines_part3 = 0
for content in open(radrankings_eligible_unique_file):
num_processed_lines_part3 += 1
if num_processed_lines_part3 % 10000 == 0:
        print('num_processed_lines_part3={}'.format(num_processed_lines_part3))
record = json.loads(content)
radranking_pool = {
'paid_radid_to_attributes': {},
'url': str(record['url'])
}
#radranking_pool["url"] = str(url)
#radranking_pool["paid"] = {}
    # generate the rads eligible to show on each host url; each rad
    # carries (imps, clicks, buyer_site_id, buyer_campaign_id, bid)
radid_to_attributes = record["paid_radids_to_attributes"]
    for rad_id, attributes in radid_to_attributes.items():
        buyer_site_id, buyer_campaign_id = rad_info[int(rad_id)]
        bid = campaignid_to_bid[buyer_campaign_id]
radranking_pool['paid_radid_to_attributes'][rad_id] = (
            # floor impressions at MINIMUM_IMPRESSIONS so rads without
            # enough imps don't dominate high ranks in the simulation
max(attributes['rad_impressions'], MINIMUM_IMPRESSIONS),
attributes['rad_clicks'],
            # buyer_site_id and buyer_campaign_id may be used later to derive
            # a reasonable CTR per buyer site or buyer campaign when the
            # buyer rads have not received enough imps
buyer_site_id,
buyer_campaign_id,
bid
)
radrankings_with_bid_fw.write(json.dumps(radranking_pool) + "\n")
radrankings_with_bid_fw.close()
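# Each line of _radrankings_with_bid.txt now looks like the following
# (values are illustrative, drawn from the examples above):
#   {"url": "http://www.livestrong.com/article/100261-pepsin/",
#    "paid_radid_to_attributes": {"196771": [500, 0, 3, 4, 0.05]}}
# where 499 raw impressions were floored to MINIMUM_IMPRESSIONS=500.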
# PART 4: simulation: for each seller url, estimate the probability that
# the target campaign's rads take the first and second positions
rad_position_probability_file = DATA_PATH + '/_rad_probability.txt'
print('4) Simulation (%s)...' % rad_position_probability_file)
rads_set = set(json.loads(open(buyer_rads_file).read()))
rad_prod_fw = open(rad_position_probability_file, 'w')
num_processed_lines_part4 = 0
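# Each iteration draws a CTR for every rad from Beta(clicks + ALPHA,
# impressions + BETA) and ranks rads by eCPM = bid * sampled_ctr.
# A sketch of one draw (numbers illustrative):
#   >>> numpy.random.beta(26 + ALPHA, 500 + BETA)  # 26 clicks, 500 imps
#   0.0487  # a single sample; the Beta(27, 510) mean is 27/537 ~= 0.05
# Note the second shape parameter uses impressions rather than
# (impressions - clicks); for small CTRs this closely approximates the
# standard Beta posterior Beta(ALPHA + clicks, BETA + impressions - clicks).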
for content in open(radrankings_with_bid_file):
if random.uniform(0, 1) > SAMPLE_PROBABILITY:
continue
    num_processed_lines_part4 += 1
    if num_processed_lines_part4 % 10000 == 0:
        print('num_processed_lines_part4={}'.format(num_processed_lines_part4))
    record = json.loads(content)
    url = record['url']
    radid_to_attributes = record['paid_radid_to_attributes']
    rad_rank_by_ecpm = []
    rad_position0 = []
    rad_position1 = []
    if USE_NUMPY:
        ecpm = dict((i, {}) for i in range(SIMULATION_ITERATIONS))
        for rad_id, attributes in radid_to_attributes.items():
            impressions = attributes[0]
            clicks = attributes[1]
            bid = attributes[4]
            # TODO(kevinx|ming): use matrix operations from numpy for speed
            for i, beta in enumerate(numpy.random.beta(
                    clicks + ALPHA, impressions + BETA,
                    size=SIMULATION_ITERATIONS)):
                ecpm[i][str(rad_id)] = bid * beta
        for i in range(SIMULATION_ITERATIONS):
            # rank rads by ecpm in descending order
            _ecpm = ecpm[i]
            rad_rank_by_ecpm.append(sorted(_ecpm, key=_ecpm.get,
                                           reverse=True))
            rad_position0.append(rad_rank_by_ecpm[i][0])
            # guard against pages that have a single paid rad
            rad_position1.append(rad_rank_by_ecpm[i][1]
                                 if len(rad_rank_by_ecpm[i]) > 1 else None)
    else:
        ecpm = {}
        for i in range(SIMULATION_ITERATIONS):
            ecpm[i] = {}
            for rad_id, attributes in radid_to_attributes.items():
                impressions = attributes[0]
                clicks = attributes[1]
                bid = attributes[4]
                ecpm[i][str(rad_id)] = random.betavariate(
                    clicks + ALPHA, impressions + BETA) * bid
            # rank rads by ecpm in descending order
            rad_rank_by_ecpm.append(sorted(ecpm[i], key=ecpm[i].get,
                                           reverse=True))
            # append the winning rad ids, guarding single-rad pages
            rad_position0.append(rad_rank_by_ecpm[i][0])
            rad_position1.append(rad_rank_by_ecpm[i][1]
                                 if len(rad_rank_by_ecpm[i]) > 1 else None)
    prob = {'url': url}
    # fraction of iterations in which a buyer rad won each position
    counts_first = float(sum(1 for rad in rad_position0 if rad in rads_set))
    prob['probability_position0'] = round(
        counts_first / SIMULATION_ITERATIONS, 4)
    counts_second = float(sum(1 for rad in rad_position1 if rad in rads_set))
    prob['probability_position1'] = round(
        counts_second / SIMULATION_ITERATIONS, 4)
rad_prod_fw.write(json.dumps(prob) + "\n")
rad_prod_fw.close()
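# Illustration (hypothetical url): if a buyer rad won position 0 in 6 of
# the 20 iterations and position 1 in 4 of them, the record written is
#   {"url": "...", "probability_position0": 0.3, "probability_position1": 0.2}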
# PART 5: predict impressions for the buyer campaign on seller urls
# These inputs come from pig scripts:
click_count_fd = open(PIG_CLICK_COUNT_FILE)
# campaign_id, (seller) site_id, content_unit, position, clicks
click_count_info_list = []
for content in click_count_fd:
click_count_info_list.append([int(x) for x in content.split('\t')])
imp_count_fd = open(PIG_IMP_COUNT_FILE)
# campaign_id, (seller) site_id, content_unit, position, imps
imp_count_info_list = []
for content in imp_count_fd:
imp_count_info_list.append([int(x) for x in content.split('\t')])
# join clicks with impressions on (campaign_id, site_id, content_unit,
# position) to compute CTR; keying a dict on that tuple avoids the
# quadratic pairwise scan (one count per key is assumed in each file)
_imps_by_key = {}
for _imp in imp_count_info_list:
    _imps_by_key[tuple(_imp[:4])] = _imp[4]
# campaign_id, (seller) site_id, content_unit, position, CTR
_campaign_site_content_position_ctr = []
for _click in click_count_info_list:
    key = tuple(_click[:4])
    if key in _imps_by_key:
        _campaign_site_content_position_ctr.append(
            list(key) + [float(_click[4]) / _imps_by_key[key]])
if not _campaign_site_content_position_ctr:
    raise RuntimeError(
        "The _campaign_site_content_position_ctr list is empty. "
        "Please make sure %s and %s share common "
        "(campaign_id, site_id, content_unit, position) keys." % (
            PIG_CLICK_COUNT_FILE, PIG_IMP_COUNT_FILE))
campaign_site_content_position_ctr = DataFrame(
_campaign_site_content_position_ctr,
columns=['campaign_id', 'site_id', 'content_unit', 'position', 'ctr'])
# this is just for simulation: targeted buyer_campaign on one seller_site
# site_id=3: livestrong; site_id=4: ehow; site_id=25: cracked
# content_position_ctr is not consumed below yet; presumably it feeds the
# pending click prediction (see the TODO in part 5)
content_position_ctr = campaign_site_content_position_ctr[
    (campaign_site_content_position_ctr.campaign_id == BUYER_CAMPAIGN_ID)
    & (campaign_site_content_position_ctr.site_id == 3)]
host_url_imp_fd = open(PIG_HOST_URL_IMPS_FILE)
# site_id, host_url, cu_imps
_host_url_imp_info_list = []
for content in host_url_imp_fd:
    content = content.rstrip()
    fields = content.split('\t')
    if len(fields) != 3:
        print("Warning: %s (line is not properly tab delimited: '%s')" % (
            PIG_HOST_URL_IMPS_FILE, content))
        continue
    site_id, host_url, cu_imps = fields
    _host_url_imp_info_list.append((int(site_id), host_url, int(cu_imps)))
host_url_cu_imps = DataFrame(
_host_url_imp_info_list, columns=['site_id', 'url', 'cu_imps'])
_rad_position_probability_list = []  # url, prob_position0, prob_position1
for content in open(rad_position_probability_file):
    record = json.loads(content)
    # use 'row', not 'list', to avoid shadowing the builtin
    row = [
        str(record['url']),
        record['probability_position0'],
        record['probability_position1']
    ]
    _rad_position_probability_list.append(row)
rad_position_probability_list = DataFrame(
_rad_position_probability_list,
columns=['url', 'prob_position0', 'prob_position1'])
# map url with positions
url_position = merge(
host_url_cu_imps,
rad_position_probability_list,
on='url',
how='inner')
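# The inner join keeps only urls present in both inputs; a resulting row
# might look like (cu_imps from the sample data above, probabilities
# hypothetical):
#   site_id=3, url='http://www.livestrong.com/article/100261-pepsin/',
#   cu_imps=63, prob_position0=0.3, prob_position1=0.2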
num_processed_lines_part5 = 0
pred_imps = var_imps = 0.0
pred_clicks = var_clicks = 0.0  # TODO(ming): predict clicks as well
# url_position contains: site_id, url, cu_imps, prob_position0, prob_position1
for idx in url_position.index:
    cu_imps = url_position['cu_imps'][idx]
    prob_position0 = url_position['prob_position0'][idx]
    prob_position1 = url_position['prob_position1'][idx]
    num_processed_lines_part5 += 1
    if num_processed_lines_part5 % 1000 == 0:
        print('num_processed_lines_part5={}'.format(
            num_processed_lines_part5))
    # expected imps for this url: each of its cu_imps goes to the buyer
    # campaign with probability (prob_position0 + prob_position1)
    pred_imps += cu_imps * (prob_position0 + prob_position1)
    # normal approximation for the variance: each position contributes
    # binomial variance cu_imps * p * (1 - p)
    var_imps += cu_imps * (
        prob_position0 * (1 - prob_position0) +
        prob_position1 * (1 - prob_position1))
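# Worked example (hypothetical url): a url with cu_imps=1000, p0=0.3,
# p1=0.2 contributes 1000 * (0.3 + 0.2) = 500 to pred_imps and
# 1000 * (0.3 * 0.7 + 0.2 * 0.8) = 370 to var_imps.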
pred = {}
# rescale by SAMPLE_PROBABILITY since part 4 simulated only that fraction
# of the urls
pred['pred_imps'] = int(pred_imps / SAMPLE_PROBABILITY)
# 1.96 is the standard normal z-value for a 95% confidence level,
# assuming the predicted imps are normally distributed
pred['pred_imps_upper'] = int((pred_imps + 1.96 * sqrt(var_imps)) /
                              SAMPLE_PROBABILITY)
pred['pred_imps_lower'] = int((pred_imps - 1.96 * sqrt(var_imps)) /
                              SAMPLE_PROBABILITY)
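# Continuing the worked example with pred_imps=500 and var_imps=370:
#   pred_imps       -> int(500 / 0.2) = 2500
#   pred_imps_upper -> int((500 + 1.96 * sqrt(370)) / 0.2) = 2688
#   pred_imps_lower -> int((500 - 1.96 * sqrt(370)) / 0.2) = 2311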
prediction_fw = open(DATA_PATH + '/_prediction.txt', 'w')
prediction_fw.write(json.dumps(pred) + "\n")
prediction_fw.close()