Description (usage of merge() for data frames):
merge(x, y, by = intersect(names(x), names(y)),
by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all,
sort = TRUE, suffixes = c(".x",".y"),
incomparables = NULL, ...)
Example:
# Author table: one row per author, identified by `surname`.
# I() keeps the key column as-is (class "AsIs") instead of letting
# data.frame() convert it.
authors <- data.frame(
  surname = I(c("Tukey", "Venables", "Tierney", "Ripley", "McNeil")),
  nationality = c("US", "Australia", "US", "UK", "Australia"),
  deceased = rep(c("yes", "no"), times = c(1, 4))
)

# Book table: one row per book, identified by the author's `name`.
# "Ripley" appears twice and "R Core" has no matching row in `authors`.
books <- data.frame(
  name = I(c(
    "Tukey", "Venables", "Tierney",
    "Ripley", "Ripley", "McNeil", "R Core"
  )),
  title = c(
    "Exploratory Data Analysis",
    "Modern Applied Statistics ...",
    "LISP-STAT",
    "Spatial Statistics", "Stochastic Simulation",
    "Interactive Data Analysis",
    "An Introduction to R"
  ),
  other.author = c(
    NA, "Ripley", NA, NA, NA, NA,
    "Venables & Smith"
  )
)

# Join the two tables on differently named key columns
# (authors$surname against books$name). With the default all = FALSE only
# keys present in both tables are kept.
m <- merge(x = authors, y = books, by.x = "surname", by.y = "name")
Output:
(1) the data set 'authors':
(2) the data set 'books':
(3) the data set 'm':
Monday, August 20, 2012
Thursday, August 16, 2012
[Pig] Use of conditional commands
--------------- page views
-- Per-user page-view and click statistics for one site, plus an overall
-- summary row and a distribution of users by page-view count.
-- NOTE(review): JSON2MAP is not defined or registered in this snippet;
-- presumably a UDF that parses a JSON string into a Pig map -- confirm.
-- NOTE(review): no STORE/DUMP statement appears in this snippet, so the
-- final relations (user_group, pv_distribution) are only defined here.

-- Read every impression record as one raw line of text.
rawimps = LOAD 'imp' USING TextLoader AS line:chararray;
imp = FOREACH rawimps GENERATE JSON2MAP(line);
-- Pull the nested publisher and user descriptors out of the parsed record.
imp = FOREACH imp GENERATE JSON2MAP($0#'publisher_descriptor') as publisher, JSON2MAP($0#'user_descriptor') as user;
-- only includes LiveStrong (which has site_id=3)
imp = FILTER imp by publisher#'site_id'=='3';
imp_user = FOREACH imp GENERATE (chararray) user#'uuid' as user_id;
-- One group per user; the bag size is that user's page-view count.
imp_user = GROUP imp_user BY user_id parallel 5;
pv_user = FOREACH imp_user GENERATE $0 as user_id, COUNT($1) as pv;
--------------- clicks
rawclicks = LOAD 'click' USING TextLoader AS line:chararray;
click = FOREACH rawclicks GENERATE JSON2MAP(line);
click = FOREACH click GENERATE JSON2MAP($0#'publisher_descriptor') as publisher, JSON2MAP($0#'user_descriptor') as user, JSON2MAP($0#'clicked_rad') as clicked_rad;
-- only includes LiveStrong (which has site_id=3)
click_user = FILTER click BY publisher#'site_id'=='3';
-- Project from the filtered relation (click_user). Projecting from the
-- unfiltered `click` relation would discard the site filter above and
-- count clicks from every site.
click_user = FOREACH click_user GENERATE (chararray) user#'uuid' as user_id, (float) clicked_rad#'cost' as cost;
-- Conditional (bincond) flags: exactly one of free_click / paid_click is
-- non-null per click, so COUNT over each column below (which skips nulls)
-- yields the number of free and paid clicks respectively.
click_user = FOREACH click_user GENERATE user_id, cost, (cost==0.0? 'free':null) as free_click, (cost!=0.0? 'paid':null) as paid_click;
click_user = GROUP click_user by user_id parallel 5;
click_user = FOREACH click_user GENERATE $0 as user_id, COUNT($1.free_click) as free_click, COUNT($1.paid_click) as paid_click, SUM($1.cost) as total_cost;
-- Keep every user that has page views, even those with no clicks.
pv_click_user = JOIN pv_user BY user_id LEFT OUTER, click_user BY user_id parallel 5;
-- Users without clicks have null click columns after the outer join;
-- replace those nulls with 0 so the sums below are not affected.
pv_click_user = FOREACH pv_click_user GENERATE (chararray) $0 as user_id, (long) pv_user::pv as pv, (long) (click_user::free_click is null?0:click_user::free_click) as free_click, (long) (click_user::paid_click is null?0:click_user::paid_click) as paid_click, (double) (click_user::total_cost is null?0:click_user::total_cost) as total_cost;
-- Collapse all users into a single group to compute site-wide totals.
user_group = GROUP pv_click_user ALL parallel 5;
user_group = FOREACH user_group GENERATE COUNT($1.user_id) as visitors, SUM($1.pv) as pv, SUM($1.free_click) as free_click, SUM($1.paid_click) as paid_click, SUM($1.total_cost) as revenue;
-- PV distribution
-- Number of users (counts) for each distinct page-view count (pv).
pv_distribution = GROUP pv_user BY pv parallel 5;
pv_distribution = FOREACH pv_distribution GENERATE $0 as pv, COUNT($1) as counts;
Monday, August 13, 2012
[Python] Fibonacci series
# fill in this function
def fib(limit=100):
    """Generate the Fibonacci sequence 1, 1, 2, 3, 5, ...

    Args:
        limit: Maximum number of terms to yield. Defaults to 100, the bound
            that was previously hard-coded, so ``fib()`` behaves as before.

    Yields:
        The next Fibonacci number, starting with 1, 1.
    """
    a, b = 1, 1
    # range() works on both Python 2 and Python 3; xrange() exists only on
    # Python 2 and raises NameError on Python 3.
    for _ in range(limit):
        yield a
        a, b = b, a + b
# testing code
import itertools
import types

# isinstance() is the idiomatic type check; print(...) with a single
# argument produces the same output on Python 2 and Python 3.
if isinstance(fib(), types.GeneratorType):
    print("Good, The fib function is a generator.")

    # Show only the first 10 terms; islice stops pulling from the generator
    # after the 10th value.
    for n in itertools.islice(fib(), 10):
        print(n)
## output:
Good, The fib function is a generator.
1
1
2
3
5
8
13
21
34
55
def fib(limit=100):
    """Generate the Fibonacci sequence 1, 1, 2, 3, 5, ...

    Args:
        limit: Maximum number of terms to yield. Defaults to 100, the bound
            that was previously hard-coded, so ``fib()`` behaves as before.

    Yields:
        The next Fibonacci number, starting with 1, 1.
    """
    a, b = 1, 1
    # range() works on both Python 2 and Python 3; xrange() exists only on
    # Python 2 and raises NameError on Python 3.
    for _ in range(limit):
        yield a
        a, b = b, a + b
# testing code
import itertools
import types

# isinstance() is the idiomatic type check; print(...) with a single
# argument produces the same output on Python 2 and Python 3.
if isinstance(fib(), types.GeneratorType):
    print("Good, The fib function is a generator.")

    # Show only the first 10 terms; islice stops pulling from the generator
    # after the 10th value.
    for n in itertools.islice(fib(), 10):
        print(n)
## output:
Good, The fib function is a generator.
1
1
2
3
5
8
13
21
34
55
Subscribe to:
Posts (Atom)