Thursday, August 16, 2012

[Pig] Using conditional (bincond) expressions



-- ---------------------------------------------------------------
-- Page views: one row per user with that user's impression count.
-- ---------------------------------------------------------------
-- Every input line is loaded as one chararray and handed to JSON2MAP.
raw_imp_lines = LOAD 'imp' USING TextLoader AS line:chararray;

-- Parse the outer record, then the two nested descriptors that are needed.
imp_json = FOREACH raw_imp_lines GENERATE JSON2MAP(line);
imp_maps = FOREACH imp_json GENERATE
    JSON2MAP($0#'publisher_descriptor') AS publisher,
    JSON2MAP($0#'user_descriptor') AS user;

-- Keep only LiveStrong impressions (site_id = 3).
imp_site = FILTER imp_maps BY publisher#'site_id' == '3';

-- Count impressions per user uuid.
imp_uuid = FOREACH imp_site GENERATE (chararray) user#'uuid' AS user_id;
imp_by_user = GROUP imp_uuid BY user_id PARALLEL 5;
pv_user = FOREACH imp_by_user GENERATE group AS user_id, COUNT(imp_uuid) AS pv;

-- ---------------------------------------------------------------
-- Clicks: one row per user with free/paid click counts and total cost.
-- ---------------------------------------------------------------
-- Every input line is loaded as one chararray and handed to JSON2MAP.
raw_click_lines = LOAD 'click' USING TextLoader AS line:chararray;

-- Parse the outer record, then the three nested descriptors that are needed.
click_json = FOREACH raw_click_lines GENERATE JSON2MAP(line);
click_maps = FOREACH click_json GENERATE
    JSON2MAP($0#'publisher_descriptor') AS publisher,
    JSON2MAP($0#'user_descriptor') AS user,
    JSON2MAP($0#'clicked_rad') AS clicked_rad;

-- Keep only LiveStrong clicks (site_id = 3).
click_site = FILTER click_maps BY publisher#'site_id' == '3';

-- Project from the FILTERED relation. Each step has its own alias so a later
-- statement cannot accidentally read the unfiltered rows and drop the site filter.
click_cost = FOREACH click_site GENERATE
    (chararray) user#'uuid' AS user_id,
    (float) clicked_rad#'cost' AS cost;

-- Tag each click with a non-null marker in exactly one of two columns so that
-- COUNT (which skips nulls) can count free and paid clicks separately.
-- NOTE(review): a null cost should leave both markers null, so such a click is
-- counted as neither free nor paid -- confirm against Pig's null semantics.
click_flagged = FOREACH click_cost GENERATE
    user_id,
    cost,
    (cost == 0.0 ? 'free' : null) AS free_click,
    (cost != 0.0 ? 'paid' : null) AS paid_click;

click_by_user = GROUP click_flagged BY user_id PARALLEL 5;

click_user = FOREACH click_by_user GENERATE
    group AS user_id,
    COUNT(click_flagged.free_click) AS free_click,
    COUNT(click_flagged.paid_click) AS paid_click,
    SUM(click_flagged.cost) AS total_cost;

-- Combine page views and clicks per user. The left outer join keeps every user
-- that has page views; users without a click row get nulls on the click side.
pv_click_joined = JOIN pv_user BY user_id LEFT OUTER, click_user BY user_id PARALLEL 5;

-- Replace the nulls produced by the outer join with zeros and fix the column types.
pv_click_user = FOREACH pv_click_joined GENERATE
    (chararray) pv_user::user_id AS user_id,
    (long) pv_user::pv AS pv,
    (long) (click_user::free_click is null ? 0 : click_user::free_click) AS free_click,
    (long) (click_user::paid_click is null ? 0 : click_user::paid_click) AS paid_click,
    (double) (click_user::total_cost is null ? 0 : click_user::total_cost) AS total_cost;

-- Collapse all users into a single group and emit one row of site-wide totals.
all_users = GROUP pv_click_user ALL PARALLEL 5;
user_group = FOREACH all_users GENERATE
    COUNT(pv_click_user.user_id) AS visitors,
    SUM(pv_click_user.pv) AS pv,
    SUM(pv_click_user.free_click) AS free_click,
    SUM(pv_click_user.paid_click) AS paid_click,
    SUM(pv_click_user.total_cost) AS revenue;

-- PV distribution: for each distinct page-view count, how many users have it.
users_by_pv = GROUP pv_user BY pv PARALLEL 5;
pv_distribution = FOREACH users_by_pv GENERATE
    group AS pv,
    COUNT(pv_user) AS counts;



No comments:

Post a Comment