Monday, August 20, 2012

[R] Merge Data Sets in R

Description:
merge() joins two data frames by common columns or row names (a database-style join).

Usage:
merge(x, y, by = intersect(names(x), names(y)),
      by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all,
      sort = TRUE, suffixes = c(".x",".y"),
      incomparables = NULL, ...)


Example:
# Author records. I() marks the key column as "AsIs" so data.frame() stores
# the character vector unchanged.
authors <- data.frame(
  surname = I(c("Tukey", "Venables", "Tierney", "Ripley", "McNeil")),
  nationality = c("US", "Australia", "US", "UK", "Australia"),
  deceased = rep(c("yes", "no"), times = c(1, 4))
)

# Book records. The key column is called `name` here, not `surname`, and
# contains one entry ("R Core") that has no counterpart in `authors`.
books <- data.frame(
  name = I(c(
    "Tukey", "Venables", "Tierney", "Ripley", "Ripley", "McNeil", "R Core"
  )),
  title = c(
    "Exploratory Data Analysis",
    "Modern Applied Statistics ...",
    "LISP-STAT",
    "Spatial Statistics",
    "Stochastic Simulation",
    "Interactive Data Analysis",
    "An Introduction to R"
  ),
  other.author = c(NA, "Ripley", NA, NA, NA, NA, "Venables & Smith")
)

# The key columns are named differently in the two data frames, so they are
# given separately via by.x / by.y. With the default all = FALSE only rows
# whose key appears in both data frames are kept.
m <- merge(
  authors,
  books,
  by.x = "surname",
  by.y = "name"
)

Output:
(1) the data set 'authors':

(2) the data set 'books':

(3) the data set 'm':












Thursday, August 16, 2012

[Pig] Use of conditional commands



--------------- page views
-- Load every line of 'imp' whole, as a single chararray field.
imp_lines = LOAD 'imp' USING TextLoader AS line:chararray;

-- Apply JSON2MAP to the line, then again to two nested fields of the result.
imp_json = FOREACH imp_lines GENERATE JSON2MAP(line);
imp_maps = FOREACH imp_json GENERATE
    JSON2MAP($0#'publisher_descriptor') AS publisher,
    JSON2MAP($0#'user_descriptor') AS user;

-- only includes LiveStrong (which has site_id=3)
site_imps = FILTER imp_maps BY publisher#'site_id' == '3';

-- Keep just the user's uuid, then collect the impressions of each user.
imp_uuids = FOREACH site_imps GENERATE (chararray) user#'uuid' AS user_id;
imps_by_user = GROUP imp_uuids BY user_id PARALLEL 5;

-- One row per user: the user id and the number of impressions (page views).
pv_user = FOREACH imps_by_user GENERATE group AS user_id, COUNT(imp_uuids) AS pv;

--------------- clicks
-- Load every line of 'click' whole, as a single chararray field.
rawclicks = LOAD 'click' USING TextLoader AS line:chararray;

-- Apply JSON2MAP to the line, then again to three nested fields of the result.
click = FOREACH rawclicks GENERATE JSON2MAP(line);
click = FOREACH click GENERATE JSON2MAP($0#'publisher_descriptor') as publisher, JSON2MAP($0#'user_descriptor') as user, JSON2MAP($0#'clicked_rad') as clicked_rad;

-- only includes LiveStrong (which has site_id=3)
click_user = FILTER click BY publisher#'site_id'=='3';
-- Project from the filtered relation (click_user), not from the unfiltered
-- 'click'; reading from 'click' here would silently drop the site_id filter
-- and count clicks from every site.
click_user = FOREACH click_user GENERATE (chararray) user#'uuid' as user_id, (float) clicked_rad#'cost' as cost;
-- Tag each click as free or paid. The non-matching tag is left null so that
-- COUNT below, which skips null values, counts only clicks of that kind.
click_user = FOREACH click_user GENERATE user_id, cost, (cost==0.0? 'free':null) as free_click, (cost!=0.0? 'paid':null) as paid_click;
click_user = GROUP click_user by user_id parallel 5;

-- One row per user: number of free clicks, number of paid clicks, summed cost.
click_user = FOREACH click_user GENERATE $0 as user_id, COUNT($1.free_click) as free_click, COUNT($1.paid_click) as paid_click, SUM($1.cost) as total_cost;

-- Left outer join: every user with page views is kept; users without any
-- click get null click columns.
pv_click_joined = JOIN pv_user BY user_id LEFT OUTER, click_user BY user_id PARALLEL 5;

-- Replace the nulls produced by the outer join with 0 and fix the column types.
pv_click_user = FOREACH pv_click_joined GENERATE
    (chararray) pv_user::user_id AS user_id,
    (long) pv_user::pv AS pv,
    (long) (click_user::free_click is null ? 0 : click_user::free_click) AS free_click,
    (long) (click_user::paid_click is null ? 0 : click_user::paid_click) AS paid_click,
    (double) (click_user::total_cost is null ? 0 : click_user::total_cost) AS total_cost;

-- Collapse all users into a single group and compute site-wide totals.
all_users = GROUP pv_click_user ALL PARALLEL 5;
user_group = FOREACH all_users GENERATE
    COUNT(pv_click_user.user_id) AS visitors,
    SUM(pv_click_user.pv) AS pv,
    SUM(pv_click_user.free_click) AS free_click,
    SUM(pv_click_user.paid_click) AS paid_click,
    SUM(pv_click_user.total_cost) AS revenue;

-- PV distribution
-- Group users by their page-view count, then report for each distinct count
-- how many users have it.
users_by_pv = GROUP pv_user BY pv PARALLEL 5;
pv_distribution = FOREACH users_by_pv GENERATE group AS pv, COUNT(pv_user) AS counts;



Monday, August 13, 2012

[Python] Fibonacci series

# fill in this function
def fib():
    """Yield the first 100 Fibonacci numbers, starting 1, 1, 2, 3, 5, ...

    This is a generator: values are produced lazily, one per iteration step,
    and iteration stops after 100 values.
    """
    a, b = 1, 1
    # range() instead of xrange(): xrange exists only in Python 2, whereas
    # range works on both Python 2 and 3 (100 items is cheap either way).
    for _ in range(100):
        yield a
        a, b = b, a + b

# testing code
import types
# Check that fib() returns a generator object, then print its first 10 values.
if isinstance(fib(), types.GeneratorType):
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the bare print statement is a syntax error on Python 3.
    print("Good, The fib function is a generator.")
    counter = 0
    for n in fib():
        print(n)
        counter += 1
        # Stop after ten values instead of exhausting the generator.
        if counter == 10:
            break



## output: 
Good, The fib function is a generator.
1
1
2
3
5
8
13
21
34
55