Three-Day RDBMS Day 3, Session 1 Solutions

part of http://philip.greenspun.com/teaching/three-day-rdbms/

Updated January 24, 2015

Exercise: de-duplication

select u1.user_id, concat(u1.first_names, ' ', u1.last_name) as name1, u1.email,

           u2.user_id, concat(u2.first_names, ' ', u2.last_name) as name2, u2.email

from users u1, users u2

where upper(u1.last_name) = upper(u2.last_name)

and u1.user_id <> u2.user_id

and upper(left(u1.first_names,1)) = upper(left(u2.first_names,1))

order by u1.email

limit 10;

Census CPS

-- does Census data match IRS data?

select avg(age), count(*) from census_cps where receiving_alimony =1;

-- where do men receiving alimony live?

select state, age, annual_wages, annual_alimony, round(100*annual_alimony/(annual_wages+annual_alimony))

from census_cps

where receiving_alimony = 1

and sex = "M";

select sum(annual_alimony)*3000 from census_cps;

select sum(annual_child_support)*3000 from census_cps;

--- table of counts no child support/child support by state

select state, receiving_child_support, count(*)

from census_cps

where age between 30 and 40

and sex = "M"

group by state, receiving_child_support

order by state, receiving_child_support;

--- above is a failure because some states don't have any men receiving child support

select state, receiving_child_support, count(*)

from census_cps

where age between 30 and 40

and sex = "F"

group by state, receiving_child_support

order by state, receiving_child_support;

--- fix the NULL problem that was giving us three rows per state

select state, receiving_child_support, count(*)

from census_cps

where age between 30 and 40

and sex = "F"

where receiving_child_support is not null

group by state, receiving_child_support

order by state, receiving_child_support;

--- Question 4: percentage of people of one gender receiving child support?

SELECT state, count(receiving_child_support)*100.0/count(1) AS percentage

FROM

    (SELECT state, IF(receiving_child_support=1, 1, NULL) AS receiving_child_support

        FROM census_cps WHERE age BETWEEN 30 AND 40 AND sex="M") t

GROUP BY state

ORDER BY percentage desc;

--- Question 5: average wages per state

select state, count(*) as number_in_sample, round(avg(annual_wages))

from census_cps

where annual_wages > 0

and weekly_hours >= 30

group by state

order by state;

-- Question 6: median wage for one state

select count(*)

from census_cps

WHERE annual_wages > 0

AND age BETWEEN 22 AND 36

AND education_level = "Bachelor's"

AND weekly_hours >= 30

AND state = "Massachusetts";

-- Question 7: median wage for one state in one query

create view census_cps_massachusetts_subgroup

as

select * from census_cps

WHERE annual_wages > 0

AND age BETWEEN 22 AND 36

AND education_level = "Bachelor's"

AND weekly_hours >= 30

AND state = "Massachusetts";

SELECT x.annual_wages

from census_cps_massachusetts_subgroup x, census_cps_massachusetts_subgroup y

GROUP BY x.annual_wages

HAVING SUM(SIGN(1-SIGN(y.annual_wages-x.annual_wages)))/COUNT(*) > .5

LIMIT 1

Exercise: analyzing visit intervals (time series data)

Please write a query that will show, for 'moe', the time intervals in days between his visits (this report should have N-1 rows where N is the number of visits by 'moe') along with the date-times of the two visits that generated that interval, ordered by ascending date.

select v1.visit_time as start_time, v2.visit_time as end_time,

(unix_timestamp(v2.visit_time)-

unix_timestamp(v1.visit_time))/(60*60*24) as duration

from ts_visits v1, ts_visits v2, ts_users

where v1.visit_time =        (select max(v3.visit_time)

from ts_visits v3

where v3.user_id = v1.user_id

and v3.visit_time < v2.visit_time)

and v1.user_id = v2.user_id

and v1.user_id = ts_users.user_id

and name='moe';

You can also achieve the same result using aggregate functions instead of a subquery:

select v1.visit_time as start_time,

       min(v2.visit_time) as end_time,

       (unix_timestamp(min(v2.visit_time)) -

        unix_timestamp(v1.visit_time))/(60*60*24) as duration

from ts_visits v1

join ts_visits v2 on (v2.user_id = v1.user_id

                      and v2.visit_time > v1.visit_time)

join ts_users u on (u.user_id = v1.user_id) where u.name = 'moe'

group by v1.visit_id, v1.visit_time

order by v1.visit_time;

Performance note: the mysql client program gives you a quick summary of how long each query took, e.g.

39 rows in set (0.27 sec)

Which approach is faster, the subquery or group by? Why?                                                                                                                                     

Please write a query that gives a report of the average interval between visits for each user and then an overall average for the site of time between visits for a particular user.

select user_id,

avg(unix_timestamp(visit_time)-

unix_timestamp((select max(visit_time) from

ts_visits where visit_time<tsv.visit_time and

user_id=tsv.user_id)))/(60*60*24) as visit_interval

from ts_visits tsv group by user_id with rollup;

Now let's have a report with three columns:

Hint: This can be done with GROUP BY, HAVING, MIN, MAX and UNION or with the CASE statement.

Philip's solution:

select name, min(visit_time), max(visit_time)

from ts_users, ts_visits

where ts_users.user_id = ts_visits.user_id

group by ts_users.user_id

having count(*) = 2

UNION

select name, min(visit_time), concat(count(visit_time)-1, " more")

from ts_users, ts_visits

where ts_users.user_id = ts_visits.user_id

group by ts_users.user_id

having count(*) > 2

ORDER BY name;

Note that the ORDER BY applies to the entire set.

Alternative: let's try this with CASE

select

  name,

  min(visit_time) as first,

  CASE count(*)

WHEN 1 THEN NULL

WHEN 2 THEN max(visit_time)

ELSE count(*)-1 END as last

from ts_users, ts_visits

where ts_users.user_id = ts_visits.user_id

group by ts_users.user_id

ORDER BY name;