part of http://philip.greenspun.com/teaching/three-day-rdbms/
Updated January 24, 2015
select u1.user_id, concat(u1.first_names, ' ', u1.last_name) as name1, u1.email,
u2.user_id, concat(u2.first_names, ' ', u2.last_name) as name2, u2.email
from users u1, users u2
where upper(u1.last_name) = upper(u2.last_name)
and u1.user_id <> u2.user_id
and upper(left(u1.first_names,1)) = upper(left(u2.first_names,1))
order by u1.email
limit 10;
-- does Census data match IRS data?
select avg(age), count(*) from census_cps where receiving_alimony =1;
-- where do men receiving alimony live?
select state, age, annual_wages, annual_alimony, round(100*annual_alimony/(annual_wages+annual_alimony))
from census_cps
where receiving_alimony = 1
and sex = "M";
select sum(annual_alimony)*3000 from census_cps;
select sum(annual_child_support)*3000 from census_cps;
--- table of counts no child support/child support by state
select state, receiving_child_support, count(*)
from census_cps
where age between 30 and 40
and sex = "M"
group by state, receiving_child_support
order by state, receiving_child_support;
--- above is a failure because some states don't have any men receiving child support
select state, receiving_child_support, count(*)
from census_cps
where age between 30 and 40
and sex = "F"
group by state, receiving_child_support
order by state, receiving_child_support;
--- fix the NULL problem that was giving us three rows per state
select state, receiving_child_support, count(*)
from census_cps
where age between 30 and 40
and sex = "F"
where receiving_child_support is not null
group by state, receiving_child_support
order by state, receiving_child_support;
--- Question 4: percentage of people of one gender receiving child support?
SELECT state, count(receiving_child_support)*100.0/count(1) AS percentage
FROM
(SELECT state, IF(receiving_child_support=1, 1, NULL) AS receiving_child_support
FROM census_cps WHERE age BETWEEN 30 AND 40 AND sex="M") t
GROUP BY state
ORDER BY percentage desc;
--- Question 5: average wages per state
select state, count(*) as number_in_sample, round(avg(annual_wages))
from census_cps
where annual_wages > 0
and weekly_hours >= 30
group by state
order by state;
-- Question 6: median wage for one state
select count(*)
from census_cps
WHERE annual_wages > 0
AND age BETWEEN 22 AND 36
AND education_level = "Bachelor's"
AND weekly_hours >= 30
AND state = "Massachusetts";
-- Question 7: median wage for one state in one query
create view census_cps_massachusetts_subgroup
as
select * from census_cps
WHERE annual_wages > 0
AND age BETWEEN 22 AND 36
AND education_level = "Bachelor's"
AND weekly_hours >= 30
AND state = "Massachusetts";
SELECT x.annual_wages
from census_cps_massachusetts_subgroup x, census_cps_massachusetts_subgroup y
GROUP BY x.annual_wages
HAVING SUM(SIGN(1-SIGN(y.annual_wages-x.annual_wages)))/COUNT(*) > .5
LIMIT 1
Please write a query that will show, for 'moe', the time intervals in days between his visits (this report should have N-1 rows where N is the number of visits by 'moe') along with the date-times of the two visits that generated that interval, ordered by ascending date.
select v1.visit_time as start_time, v2.visit_time as end_time,
(unix_timestamp(v2.visit_time)-
unix_timestamp(v1.visit_time))/(60*60*24) as duration
from ts_visits v1, ts_visits v2, ts_users
where v1.visit_time = (select max(v3.visit_time)
from ts_visits v3
where v3.user_id = v1.user_id
and v3.visit_time < v2.visit_time)
and v1.user_id = v2.user_id
and v1.user_id = ts_users.user_id
and name='moe';
You can also achieve the same result using aggregate functions instead of a subquery:
select v1.visit_time as start_time,
min(v2.visit_time) as end_time,
(unix_timestamp(min(v2.visit_time)) -
unix_timestamp(v1.visit_time))/(60*60*24) as duration
from ts_visits v1
join ts_visits v2 on (v2.user_id = v1.user_id
and v2.visit_time > v1.visit_time)
join ts_users u on (u.user_id = v1.user_id) where u.name = 'moe'
group by v1.visit_id, v1.visit_time
order by v1.visit_time;
Performance note: the mysql client program gives you a quick summary of how long each query took, e.g.
39 rows in set (0.27 sec)
Which approach is faster, the subquery or group by? Why?
Please write a query that gives a report of the average interval between visits for each user and then an overall average for the site of time between visits for a particular user.
select user_id,
avg(unix_timestamp(visit_time)-
unix_timestamp((select max(visit_time) from
ts_visits where visit_time<tsv.visit_time and
user_id=tsv.user_id)))/(60*60*24) as visit_interval
from ts_visits tsv group by user_id with rollup;
Now let's have a report with three columns:
Hint: This can be done with GROUP BY, HAVING, MIN, MAX and UNION or with the CASE statement.
Philip's solution:
select name, min(visit_time), max(visit_time)
from ts_users, ts_visits
where ts_users.user_id = ts_visits.user_id
group by ts_users.user_id
having count(*) = 2
UNION
select name, min(visit_time), concat(count(visit_time)-1, " more")
from ts_users, ts_visits
where ts_users.user_id = ts_visits.user_id
group by ts_users.user_id
having count(*) > 2
ORDER BY name;
Note that the ORDER BY applies to the entire set.
Alternative: let's try this with CASE
select
name,
min(visit_time) as first,
CASE count(*)
WHEN 1 THEN NULL
WHEN 2 THEN max(visit_time)
ELSE count(*)-1 END as last
from ts_users, ts_visits
where ts_users.user_id = ts_visits.user_id
group by ts_users.user_id
ORDER BY name;