import simplejson as json
from math import sqrt
import numpy
import os
from pandas import DataFrame
from pandas.tools.merge import merge
import random
import re
USE_NUMPY = True
SAMPLE_PROBABILITY = 0.2  # sample 20% of the records for the simulation
SIMULATION_ITERATIONS = 20  # number of simulation iterations per record
MINIMUM_IMPRESSIONS = 500
# Beta prior parameters for sampling CTR: ALPHA is added to clicks,
# BETA to impressions (see the beta draws in part 4)
ALPHA = 1
BETA = 10
# specify the buyer_campaign
BUYER_CAMPAIGN_ID = 201
DATA_PATH = os.environ['DATA_PATH'] if 'DATA_PATH' in os.environ else '.'
# Prerequisite input files, needed by this program:
# Example:
# {"status": "active", "title": "Pork Rinds Nutrition Information",
# "url": "http://www.livestrong.com/article/263272-pork-rinds-nutr...",
# "site_id": 3, "campaign_id": 4,
# "image_url": "http://livestrong.com/media/images/category-images/...jpg",
# "id": 15949, "__key__": "Rad:00f70ed87c579203b4521627204912ffb32c29ef..."}
RAD_FILE = DATA_PATH + '/rad.txt'
# Example:
# {"status": "active", "account_id": 3,
# "allowed_countries": ["KR", "SY", "IR", "CU", "SD"], "site_id": 3,
# "date_end": null, "date_start": null, "internal_tracking": "",
# "budget": null, "id": 3, "external_tracking": "",
# "allowed_countries_is_whitelist": false, "link_attribution": "",
# "bid": null, "__key__": "Campaign:3", "name": "LS Top 10,000"}
CAMPAIGN_FILE = DATA_PATH + '/campaign.txt'
# Huge file, example:
# {"free_radids_to_attributes": {
# "196771": {"rad_impressions": 499, "rad_clicks": 0, "is_editorial": null},
# "191140": {"rad_impressions": 38, "rad_clicks": 26, "is_editorial": null},
# ...
# "paid_radids_to_attributes": {
# ... }
# "url": "http://www.ehow.com/12345-some-silly-document.html"}
# TODO(ming|kevinx): document how to retrieve rankings from Tracker
# For faster run:
# cat radrankings | head -1000 > sample_radrankings.txt
RADRANKINGS_FILE = DATA_PATH + '/sample_radrankings.txt'
# campaign_id, (seller) site_id, content_unit, position, clicks
# 3 3 2 4 99
# 4 3 2 3 261
PIG_CLICK_COUNT_FILE = DATA_PATH + '/click_count.txt'
# campaign_id, (seller) site_id, content_unit, position, imps
# 3 3 2 5 57086
# 7 4 27 0 62846
PIG_IMP_COUNT_FILE = DATA_PATH + '/imp_count.txt'
# site_id, host_url, cu_imps
# 3 http://www.livestrong.com/article/100261-pepsin/ 63
# 3 http://www.livestrong.com/article/24522-tonsils/ 11
PIG_HOST_URL_IMPS_FILE = DATA_PATH + '/host_url_imps.txt'
missing_files = []
for filename in (RAD_FILE, CAMPAIGN_FILE, RADRANKINGS_FILE,
                 PIG_CLICK_COUNT_FILE, PIG_IMP_COUNT_FILE,
                 PIG_HOST_URL_IMPS_FILE):
    if not os.path.exists(filename):
        missing_files.append(filename)
if missing_files:
    raise RuntimeError("Unable to find file(s): %s" % missing_files)
# TODO(kevinx): move these to testable subroutines
# PART 1:
# Return the list of buyer_rads_id for a given target seller site & campaign
buyer_rads_file = DATA_PATH + '/_buyer_rads.txt'
print '1) %s -> %s' % (RAD_FILE, buyer_rads_file)
def get_all_radids_from_campaignid(campaign_id=BUYER_CAMPAIGN_ID):
    """
    Find all the unique rad ids that belong to the given campaign_id.
    """
    rad_set = set()
    num_processed_lines_part1 = 0
    for content in open(RAD_FILE):
        num_processed_lines_part1 += 1
        if num_processed_lines_part1 % 500000 == 0:
            print('num_processed_lines_part1=%s' % num_processed_lines_part1)
        record = json.loads(content)
        if record['campaign_id'] == campaign_id:
            rad_set.add(str(record['id']))
    return rad_set  # a unique set of rad ids
rad_id_list = list(get_all_radids_from_campaignid())
buyer_rads_fw = open(buyer_rads_file, 'w')
buyer_rads_fw.write(json.dumps(rad_id_list))
buyer_rads_fw.close()
# Part 2: create a list of radranking records that the buyer_rads
# created in part 1 are eligible to show
## eventually we need to explore all sellers, but since each seller has
## different frame set-ups and CTRs, we simulate per seller first
## and then aggregate
domain_to_remain = 'http://www.livestrong.com' # specify the seller
radrankings_eligible_unique_file = (
DATA_PATH + '/_radrankings_eligible_unique.txt')
print '2) %s -> %s' % (RADRANKINGS_FILE,
radrankings_eligible_unique_file)
num_processed_lines_part2 = 0
# parse the JSON back into a list (reading the file returns a string)
buyer_rad_ids = json.loads(open(buyer_rads_file).read())
radrankings_eligible_fw = open(radrankings_eligible_unique_file, 'w')
# TODO(kevinx): allow bzip2 and gz files as input directly
url_seen = set()
duplicate_url_count = 0
for content in open(RADRANKINGS_FILE):
    num_processed_lines_part2 += 1
    if num_processed_lines_part2 % 100000 == 0:
        print 'num_processed_lines_part2={}'.format(num_processed_lines_part2)
    record = json.loads(content)
    url = record['url']
    if url in url_seen:
        duplicate_url_count += 1
        continue
    if (not url.startswith(domain_to_remain) or
            not record['paid_radids_to_attributes']):
        continue
    url_seen.add(url)
    radid_to_attributes = record['paid_radids_to_attributes']
    rad_id_pool = [str(rad_id) for rad_id in radid_to_attributes]
    # keep only records where at least one buyer rad is eligible to show
    if set(buyer_rad_ids).intersection(rad_id_pool):
        radrankings_eligible_fw.write(content)
radrankings_eligible_fw.close()
if duplicate_url_count:
    print("WARNING: Encountered %d duplicate URLs in %s" %
          (duplicate_url_count, RADRANKINGS_FILE))
# part 3: for each record generated in radrankings_eligible_unique,
# generate corresponding campaign_id and current bidding price
# for each eligible rad
## map campaign id to latest bid
campaignid_to_bid = {}
for content in open(CAMPAIGN_FILE):
    record = json.loads(content)
    campaign_id = record['id']  # this is an integer
    bid = record['bid']
    if record['status'] == 'active' and bid > 0:
        campaignid_to_bid[campaign_id] = bid
    else:
        campaignid_to_bid[campaign_id] = 0
## some campaign_ids referenced in rad.txt may no longer exist in
## campaign.txt; default their bids to 0
for campaign_id in range(max(campaignid_to_bid.keys()) + 100):
    if campaign_id not in campaignid_to_bid:
        campaignid_to_bid[campaign_id] = 0
# BUYER_CAMPAIGN_ID (201) is defined at the top of the script
campaignid_to_bid[BUYER_CAMPAIGN_ID] = 0.05  # the campaign's new bid to simulate
radrankings_with_bid_file = DATA_PATH + '/_radrankings_with_bid.txt'
print('3) %s -> %s' %
(radrankings_eligible_unique_file, radrankings_with_bid_file))
rad_info = {}  # map rad_id to (site_id, campaign_id)
for content in open(RAD_FILE):
    record = json.loads(content)
    rad_id = record["id"]
    site_id = record["site_id"]
    campaign_id = record["campaign_id"]
    rad_info[rad_id] = (site_id, campaign_id)
# now generate a file that contains the url and paid rad ids
# with tuples [impressions, clicks, buyer_site_id, buyer_campaign_id, bid]
radrankings_with_bid_fw = open(radrankings_with_bid_file, 'w')
num_processed_lines_part3 = 0
for content in open(radrankings_eligible_unique_file):
    num_processed_lines_part3 += 1
    if num_processed_lines_part3 % 10000 == 0:
        print 'num_processed_lines_part3={}'.format(num_processed_lines_part3)
    record = json.loads(content)
    radranking_pool = {
        'paid_radid_to_attributes': {},
        'url': str(record['url'])
    }
    # generate the rads that are eligible to show for each host_url;
    # each rad carries: imps, clicks, buyer_site_id, buyer_campaign_id, bid
    radid_to_attributes = record["paid_radids_to_attributes"]
    for rad_id, attributes in radid_to_attributes.items():
        buyer_site_id, buyer_campaign_id = rad_info[int(rad_id)]
        bid = campaignid_to_bid[buyer_campaign_id]
        radranking_pool['paid_radid_to_attributes'][rad_id] = (
            # force impressions to be at least MINIMUM_IMPRESSIONS so rads
            # without enough imps won't dominate high ranks in the simulation
            max(attributes['rad_impressions'], MINIMUM_IMPRESSIONS),
            attributes['rad_clicks'],
            # buyer_site_id and buyer_campaign_id may be used in the future
            # to generate a reasonable CTR per buyer_site or buyer_campaign
            # when buyer_rads haven't received enough imps
            buyer_site_id,
            buyer_campaign_id,
            bid
        )
    radrankings_with_bid_fw.write(json.dumps(radranking_pool) + "\n")
radrankings_with_bid_fw.close()
# part 4: simulation: for each seller url, estimate the probability
# the target campaign ranks the first/second place
rad_position_probability_file = DATA_PATH + '/_rad_probability.txt'
print('4) Simulation (%s)...' % rad_position_probability_file)
rads_set = set(json.loads(open(buyer_rads_file).read()))
rad_prod_fw = open(rad_position_probability_file, 'w')
num_processed_lines_part4 = 0
for content in open(radrankings_with_bid_file):
    if random.uniform(0, 1) > SAMPLE_PROBABILITY:
        continue
    num_processed_lines_part4 += 1
    if num_processed_lines_part4 % 10000 == 0:
        print 'num_processed_lines_part4={}'.format(num_processed_lines_part4)
    record = json.loads(content)
    url = record['url']
    radid_to_attributes = record['paid_radid_to_attributes']
    rad_rank_by_ecpm = []
    rad_position0 = []
    rad_position1 = []
    if USE_NUMPY:
        ecpm = dict((i, {}) for i in range(SIMULATION_ITERATIONS))
        for rad_id, attributes in radid_to_attributes.items():
            impressions = attributes[0]
            clicks = attributes[1]
            bid = attributes[4]
            # TODO(kevinx|ming): use matrix operations from numpy for speed
            for i, beta in enumerate(numpy.random.beta(
                    clicks + ALPHA, impressions + BETA,
                    size=SIMULATION_ITERATIONS)):
                ecpm[i][str(rad_id)] = bid * beta
        for i in range(SIMULATION_ITERATIONS):
            # rank rads by ecpm in descending order
            _ecpm = ecpm[i]
            rad_rank_by_ecpm.append(sorted(_ecpm, key=_ecpm.get,
                                           reverse=True))
            rad_position0.append(rad_rank_by_ecpm[i][0])
            rad_position1.append(rad_rank_by_ecpm[i][1])
    else:
        ecpm = {}
        for i in range(SIMULATION_ITERATIONS):
            ecpm[i] = {}
            for rad_id, attributes in radid_to_attributes.items():
                impressions = attributes[0]
                clicks = attributes[1]
                bid = attributes[4]
                ecpm[i][str(rad_id)] = random.betavariate(
                    clicks + ALPHA, impressions + BETA) * bid
            # rank rads by ecpm in descending order
            rad_rank_by_ecpm.append(sorted(ecpm[i], key=ecpm[i].get,
                                           reverse=True))
            # record the rad ids that win the top two positions
            rad_position0.append(rad_rank_by_ecpm[i][0])
            rad_position1.append(rad_rank_by_ecpm[i][1])
    prob = {'url': url}
    counts_first = 0.0
    for rad in rad_position0:
        if rad in rads_set:
            counts_first += 1
    prob['probability_position0'] = round(counts_first /
                                          SIMULATION_ITERATIONS, 4)
    counts_second = 0.0
    for rad in rad_position1:
        if rad in rads_set:
            counts_second += 1
    prob['probability_position1'] = round(counts_second /
                                          SIMULATION_ITERATIONS, 4)
    rad_prod_fw.write(json.dumps(prob) + "\n")
rad_prod_fw.close()
# part 5: predictions for impressions of the buyer campaign on seller urls
# These come from pig script:
click_count_fd = open(PIG_CLICK_COUNT_FILE)
# campaign_id, (seller) site_id, content_unit, position, clicks
click_count_info_list = []
for content in click_count_fd:
    click_count_info_list.append([int(x) for x in content.split('\t')])
imp_count_fd = open(PIG_IMP_COUNT_FILE)
# campaign_id, (seller) site_id, content_unit, position, imps
imp_count_info_list = []
for content in imp_count_fd:
    imp_count_info_list.append([int(x) for x in content.split('\t')])
# campaign_id, (seller) site_id, content_unit, position, CTR
_campaign_site_content_position_ctr = []
for _click in click_count_info_list:
    for _imp in imp_count_info_list:
        if (_click[0] == _imp[0] and _click[1] == _imp[1]
                and _click[2] == _imp[2] and _click[3] == _imp[3]):
            campaign_site_content_position_ctr = [
                _imp[0], _imp[1], _imp[2], _imp[3],
                float(_click[4]) / _imp[4]]
            _campaign_site_content_position_ctr.append(
                campaign_site_content_position_ctr)
if not _campaign_site_content_position_ctr:
    raise RuntimeError(
        "The _campaign_site_content_position_ctr list is empty. "
        "Please make sure %s and %s have intersections." % (
            PIG_CLICK_COUNT_FILE, PIG_IMP_COUNT_FILE))
campaign_site_content_position_ctr = DataFrame(
_campaign_site_content_position_ctr,
columns=['campaign_id', 'site_id', 'content_unit', 'position', 'ctr'])
# this is just for simulation: targeted buyer_campaign on one seller_site
# site_id=3: livestrong; site_id=4: ehow; site_id=25: cracked
content_position_ctr = campaign_site_content_position_ctr[
(campaign_site_content_position_ctr.campaign_id == BUYER_CAMPAIGN_ID)
& (campaign_site_content_position_ctr.site_id == 3)]
host_url_imp_fd = open(PIG_HOST_URL_IMPS_FILE)
# site_id, host_url, cu_imps
_host_url_imp_info_list = []
for content in host_url_imp_fd:
    content = content.rstrip()
    _host_url_info_tuple = content.split('\t')
    if len(_host_url_info_tuple) != 3:
        print("Warning: %s (line is not properly tab-delimited: '%s')" % (
            PIG_HOST_URL_IMPS_FILE, content))
        continue
    site_id, host_url, cu_imps = _host_url_info_tuple
    _host_url_imp_info_list.append((int(site_id), host_url, int(cu_imps)))
host_url_cu_imps = DataFrame(
_host_url_imp_info_list, columns=['site_id', 'url', 'cu_imps'])
_rad_position_probability_list = [] # url, prob_position0, prob_position1
for content in open(rad_position_probability_file):
    record = json.loads(content)
    row = [
        str(record['url']),
        record['probability_position0'],
        record['probability_position1']
    ]
    _rad_position_probability_list.append(row)
rad_position_probability_list = DataFrame(
_rad_position_probability_list,
columns=['url', 'prob_position0', 'prob_position1'])
# map url with positions
url_position = merge(
host_url_cu_imps,
rad_position_probability_list,
on='url',
how='inner')
num_processed_lines_part5 = 0
pred_imps = pred_clicks = 0 # TODO(ming): need to predict for clicks
var_imps = var_clicks = 0
# url_position contains: site_id, url, cu_imps, prob_position0, prob_position1
for idx in url_position.index:
    cu_imps = url_position['cu_imps'][idx]
    prob_position0 = url_position['prob_position0'][idx]
    prob_position1 = url_position['prob_position1'][idx]
    num_processed_lines_part5 += 1
    if num_processed_lines_part5 % 1000 == 0:
        print 'num_processed_lines_part5 = {}'.format(
            num_processed_lines_part5)
    pred_imps += cu_imps * (prob_position0 + prob_position1)
    # use normal approximation for estimating the variance
    var_imps += cu_imps * (
        prob_position0 * (1 - prob_position0) +
        prob_position1 * (1 - prob_position1))
pred = {}
pred['pred_imps'] = int(pred_imps / SAMPLE_PROBABILITY)
# 1.96 is the z-value of the standard normal distribution for a 95%
# confidence level, assuming the predicted imps are normally distributed
pred['pred_imps_upper'] = int((pred_imps + 1.96 * sqrt(var_imps)) /
SAMPLE_PROBABILITY)
pred['pred_imps_lower'] = int((pred_imps - 1.96 * sqrt(var_imps)) /
SAMPLE_PROBABILITY)
prediction_fw = open(DATA_PATH + '/_prediction.txt', 'w')
prediction_fw.write(json.dumps(pred) + "\n")
prediction_fw.close()
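# ---------------------------------------------------------------------------
# A minimal, standalone sketch of the part 4 ranking step, pulled out into a
# testable subroutine per the TODO above. The function name and the sample
# values below are illustrative only; they are not part of the pipeline data.
def sample_top_two(radid_to_impclickbid, alpha=ALPHA, beta=BETA):
    """Sample an eCPM for each rad from its Beta CTR posterior and return
    the rad ids ranked first and second for one simulation draw."""
    ecpm = {}
    for rad_id, (impressions, clicks, bid) in radid_to_impclickbid.items():
        ctr = random.betavariate(clicks + alpha, impressions + beta)
        ecpm[rad_id] = ctr * bid
    ranked = sorted(ecpm, key=ecpm.get, reverse=True)
    return ranked[0], ranked[1]

# Example with three hypothetical rads: repeat the draw and count how often
# rad 'a' wins position 0, mirroring the probability estimate in part 4.
_example = {'a': (500, 25, 0.05), 'b': (1000, 30, 0.04), 'c': (500, 5, 0.10)}
_wins = sum(1 for _ in range(SIMULATION_ITERATIONS)
            if sample_top_two(_example)[0] == 'a')
print 'P(rad a in position 0) ~= %.2f' % (float(_wins) / SIMULATION_ITERATIONS)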
Wednesday, October 24, 2012
[Python] Generating Histograms
import matplotlib.pyplot as plt

from numpy.random import normal
a=normal(10,2,size=1000)
plt.hist(a,bins=10)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()
With the option 'normed=True', it will return the normalized histogram, i.e., the probability distribution plot.
plt.hist(a,bins=10,normed=True)
plt.show()
With the option 'cumulative=True', it will return the cumulative distribution plot.
plt.hist(a,bins=10,normed=True,cumulative=True)
plt.show()
[Python] Data Analysis toolkit 'pandas'
pandas is well suited for many different kinds of data:
- Tabular data with heterogeneously-typed columns, as in an SQL table or Excel spreadsheet
- Ordered and unordered (not necessarily fixed-frequency) time series data.
- Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels
- Any other form of observational / statistical data sets. The data actually need not be labeled at all to be placed into a pandas data structure
One example:
Suppose there are two data sets and I'd like to do an inner join between these two tables. The generic coding logic might look like this:
data1 = [['id1', 'Kevin'], ['id2', 'John'], ['id3', 'Mike']]
data2 = [['id1', 31], ['id2', 28], ['id3', 34]]
join = []
for b in data2:
    for a in data1:
        if b[0] == a[0]:
            row = [x for x in b]
            row.append(a[1])
            join.append(row)
for j in join:
    print j
The output is as follows:
['id1', 31, 'Kevin']
['id2', 28, 'John']
['id3', 34, 'Mike']
With pandas, the code will be much shorter:
from pandas import DataFrame, merge
new_data1 = DataFrame(data1, columns=['id', 'name'])
new_data2 = DataFrame(data2, columns=['id', 'age'])
join2 = merge(new_data1, new_data2, on='id', how='inner')
print join2
The output is as follows:
id name age
0 id1 Kevin 31
1 id2 John 28
2 id3 Mike 34
Tuesday, October 2, 2012
[Python] K-nearest Neighbor classification
In pattern recognition, the k-nearest neighbor algorithm (k-NN) is a method for classifying objects based on closest training examples in the feature space.
- The training examples are vectors in a multidimensional feature space, each with a class label. The training phase of the algorithm consists only of storing the feature vectors and class labels of the training samples.
- In the classification phase, k is a user-defined constant, and an unlabeled vector (a query or test point) is classified by assigning the label which is most frequent among the k training samples nearest to that query point.
- Usually Euclidean distance is used as the distance metric; however, this is only applicable to continuous variables. In cases such as text classification, another metric such as the overlap metric (or Hamming distance) can be used. Often, the classification accuracy of k-NN can be improved significantly if the distance metric is learned with specialized algorithms such as Large Margin Nearest Neighbor or Neighbourhood Components Analysis.
- The best choice of k depends upon the data; generally, larger values of k reduce the effect of noise on the classification, but make boundaries between classes less distinct. A good k can be selected by various heuristic techniques, for example, cross-validation (for example, choose the value of k by minimizing the mis-classification rate; a small sketch of this idea follows this list). The special case where the class is predicted to be the class of the closest training sample (i.e. when k = 1) is called the nearest neighbor algorithm.
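As a rough sketch of that last point (not part of the original example): pick k by the mis-classification rate on a held-out split, using the same mlpy.KNN interface as the code below. The synthetic data and the candidate k values here are arbitrary choices for illustration; a full k-fold cross-validation would repeat this over several splits.
import numpy as np
import mlpy
np.random.seed(0)
# small synthetic two-class data set (illustrative only)
xa = np.random.multivariate_normal([1, 5], [[1, 1], [1, 2]], 100)
xb = np.random.multivariate_normal([2.5, 2.5], [[1, 0], [0, 1]], 100)
x = np.concatenate((xa, xb), axis=0)
y = np.concatenate((np.ones(100, dtype=np.int), 2 * np.ones(100, dtype=np.int)))
idx = np.random.permutation(len(y))  # shuffle, then hold out 25% for validation
train, valid = idx[:150], idx[150:]
best_k, best_err = None, 1.0
for k in (1, 3, 5, 7, 9):
    knn = mlpy.KNN(k=k)
    knn.learn(x[train], y[train])
    err = np.mean(knn.pred(x[valid]) != y[valid])  # mis-classification rate
    print 'k=%d error=%.3f' % (k, err)
    if err < best_err:
        best_k, best_err = k, err
print 'selected k:', best_k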
Python code:
import numpy as np
import matplotlib.pyplot as plt
import mlpy
np.random.seed(0)
mean1, cov1, n1 = [1, 5], [[1,1],[1,2]], 200 # 200 samples of class 1
x1 = np.random.multivariate_normal(mean1, cov1, n1)
y1 = np.ones(n1, dtype=np.int)
mean2, cov2, n2 = [2.5, 2.5], [[1,0],[0,1]], 300 # 300 samples of class 2
x2 = np.random.multivariate_normal(mean2, cov2, n2)
y2 = 2 * np.ones(n2, dtype=np.int)
mean3, cov3, n3 = [5, 8], [[0.5,0],[0,0.5]], 200 # 200 samples of class 3
x3 = np.random.multivariate_normal(mean3, cov3, n3)
y3 = 3 * np.ones(n3, dtype=np.int)
x = np.concatenate((x1, x2, x3), axis=0) # concatenate the samples
y = np.concatenate((y1, y2, y3))
knn = mlpy.KNN(k=3)
knn.learn(x, y)
knn.nclasses() # return the number of classes
xmin, xmax = x[:,0].min()-1, x[:,0].max()+1
ymin, ymax = x[:,1].min()-1, x[:,1].max()+1
xx, yy = np.meshgrid(np.arange(xmin, xmax, 0.1), np.arange(ymin, ymax, 0.1))
xnew = np.c_[xx.ravel(), yy.ravel()] # testing data
ynew = knn.pred(xnew).reshape(xx.shape) # Predict KNN model on a test point
ynew[ynew == 0] = 1 # set the samples with no unique classification to 1
fig = plt.figure(1)
cmap = plt.set_cmap(plt.cm.Paired)
plot1 = plt.pcolormesh(xx, yy, ynew) # plot the separated regions with different color
plot2 = plt.scatter(x[:,0], x[:,1], c=y) # plot the scatter plot of the data
plt.show()
Wednesday, September 26, 2012
[Python] Elastic Net Classifier
import numpy as np
import matplotlib.pyplot as plt
import mlpy
np.random.seed(0)
mean1, cov1, n1 = [1, 5], [[1,1],[1,2]], 200 # 200 samples of class 1
x1 = np.random.multivariate_normal(mean1, cov1, n1)
y1 = np.ones(n1, dtype=np.int)
mean2, cov2, n2 = [2.5, 2.5], [[1,0],[0,1]], 300 # 300 samples of class -1
x2 = np.random.multivariate_normal(mean2, cov2, n2)
y2 = -np.ones(n2, dtype=np.int)
x = np.concatenate((x1, x2), axis=0) # concatenate the samples
y = np.concatenate((y1, y2))
en = mlpy.ElasticNetC(lmb=0.01, eps=0.001)
en.learn(x, y)
w = en.w()
b = en.bias()
en.iters()
xx = np.arange(np.min(x[:,0]), np.max(x[:,0]), 0.01)
yy = - (w[0] * xx + b) / w[1] # separator line
fig = plt.figure(1) # plot
plot1 = plt.plot(x1[:, 0], x1[:, 1], 'ob', x2[:, 0], x2[:, 1], 'or')
plot2 = plt.plot(xx, yy, '--k')
plt.show()