part of http://philip.greenspun.com/teaching/three-day-rdbms/
Update January 24, 2015
-- get rid of the old table from yesterday
drop table some_numbers;
-- we won't use "numbers" because it might be a reserved word in some DBMSes
-- e.g., "number" is a data type
create table some_numbers (
one_number integer not null primary key
);
insert into some_numbers (one_number)
values
(1), (2), (3), (4);
--- produces all possible combinations (4^4 = 256 rows)
select sn1.one_number, sn2.one_number, sn3.one_number, sn4.one_number
from
some_numbers as sn1, some_numbers as sn2, some_numbers as sn3, some_numbers as sn4;
-- let's limit the results
select
sn1.one_number, sn2.one_number, sn3.one_number, sn4.one_number
from
some_numbers as sn1, some_numbers as sn2, some_numbers as sn3, some_numbers as sn4
where
sn1.one_number not in (sn2.one_number, sn3.one_number, sn4.one_number)
and
sn2.one_number not in (sn1.one_number, sn3.one_number, sn4.one_number)
and
sn3.one_number not in (sn1.one_number, sn2.one_number, sn4.one_number)
and
sn4.one_number not in (sn1.one_number, sn2.one_number, sn3.one_number);
-- 24 results = 4! (according the "the Google")
-- alternately, you could do this using pairwise comparisons:
select * from
some_numbers n1
join some_numbers n2 on (n2.one_number <> n1.one_number)
join some_numbers n3 on (n3.one_number <> n1.one_number and n3.one_number <> n2.one_number)
join some_numbers n4 on (n4.one_number <> n1.one_number and n4.one_number <> n2.one_number and n4.one_number <> n3.one_number);
SELECT n_groups, COUNT(*)
FROM (SELECT user_id, COUNT(*) AS n_groups
FROM group_memberships
GROUP BY user_id) AS first_grouping
GROUP BY n_groups
ORDER BY n_groups;
Below, for the extra credit exercise, the use of COUNT(accepted), as opposed to COUNT(*), ensures that we only count established friendships for the histogram. Also, though u.user_id in the inner select is not used in the final report, it is useful for readability and debugging, e.g., if you were to run the inner query directly. The SQL optimizer might take this column out, since it is not used in the final report.
SELECT num_friends, count(num_friends) AS people_with_this_many_friends
FROM (SELECT u.user_id, COUNT(accepted) AS num_friends
FROM friendships f, users u
WHERE f.from_user_id=u.user_id OR f.to_user_id=u.user_id
GROUP BY u.user_id) AS members
GROUP BY num_friends
ORDER BY num_friends;
Note that the JOIN between friendships and users isn't obviously necessary. We aren't interested in the actual names of the users or anything else that is found strictly in the users table. It is simply a convenient way to get two rows for every friendship, one in each direction.
Solution: courtesy Kathleen Alexander.
select u.first_names, u.last_name, g.title, NULL
from users u, group_memberships gm, groups g
where u.user_id = gm.user_id
and gm.group_id = g.group_id
group by u.user_id
having count(*) = 1
select u.first_names, u.last_name, min(g.title), max(g.title)
from users u, group_memberships gm, groups g
where u.user_id = gm.user_id
and gm.group_id = g.group_id
group by u.user_id
having count(*) = 2
select u.first_names, u.last_name, min(g.title),
concat(count(*)-1, ' more')
from users u, group_memberships gm, groups g
where u.user_id = gm.user_id
and gm.group_id = g.group_id
group by u.user_id
having count(*) > 2
select u.first_names, u.last_name, g.title as group1, NULL as group2
from users u, group_memberships gm, groups g
where u.user_id = gm.user_id
and gm.group_id = g.group_id
group by u.user_id
having count(*) = 1
UNION
select u.first_names, u.last_name, min(g.title) as group1,
max(g.title) as group2
from users u, group_memberships gm, groups g
where u.user_id = gm.user_id
and gm.group_id = g.group_id
group by u.user_id
having count(*) = 2
UNION
select u.first_names, u.last_name, min(g.title) as group1,
concat(count(*)-1, ' more') as group2
from users u, group_memberships gm, groups g
where u.user_id = gm.user_id
and gm.group_id = g.group_id
group by u.user_id
having count(*) > 2
ORDER BY UPPER(last_name), UPPER(first_names);
note that you have to remove the "u." from the column names in the ORDER BY because at that point the users table is no longer visible. The column is named simply "last_name" for example, in the results.
Sarah Zenklusen shows us how to do this problem with the sort-of-new CASE feature of SQL and no UNION at all:
SELECT u.first_names, u.last_name, max(g.title) as group1,
CASE count(g.title)
WHEN 1 THEN NULL
WHEN 2 THEN min(g.title)
ELSE concat(count(gm.group_id)-1, ' more')
END as group2
FROM group_memberships as gm
JOIN users as u on gm.user_id = u.user_id
JOIN groups as g on gm.group_id = g.group_id
GROUP BY gm.user_id
Exercise: Intersection and Minus
As noted in the big hint, these work because we have only one value per row...
Using IN to find the intersection:
select distinct promise
from promises_made
where promise in (select promise from promises_kept);
Using NOT IN to do a minus:
select distinct promise
from promises_made
where promise not in (select promise from promises_kept);
Using JOIN to find the intersection:
select distinct pm.promise
from promises_made pm, promises_kept pk
where pm.promise = pk.promise;
Using OUTER JOIN to do a minus
select distinct pm.promise
from promises_made pm left outer join promises_kept pk
on (pm.promise = pk.promise)
where pk.promise is null
Extra credit: What if we had two or three values for each row? How could we do intersections and set differences then?
-- We join on all three columns x, y and z to get the intersection, and SELECT DISTINCT
-- to de-duplicate.
SELECT DISTINCT[a] c1.x,c1.y,c1.z
FROM coordinates c1
JOIN coordinates c2 ON (c1.x = c2.x AND c1.y = c2.y AND c1.z = c2.z)
WHERE c1.source = 'herb'
AND c2.source = 'larry'
ORDER BY x,y,z;
-- The same result is produced by grouping all of the columns and looking for tuples that are represented twice:
SELECT x,y,z
FROM coordinates
GROUP BY x,y,z
HAVING COUNT(*) = 2
ORDER BY x,y,z;
-- we thought this was going to be hard, but it turns out that the IN
-- operator works on sets too, at least in MySQL:
select x, y, z
from coordinates
where source = 'herb'
and (x,y,z) in (select x, y, z from coordinates where source = 'larry')
As above, we can in fact do this trivially with NOT IN:
select x, y, z
from coordinates
where source = 'larry'
and (x,y,z) not in (select x, y, z from coordinates where source = 'herb')
But maybe we want to do it painfully, so look for Larry's unique finds by OUTER JOINing with Herb's findings and filtering for non-matching rows:
SELECT l.x, l.y, l.z
FROM
(SELECT x, y, z
FROM coordinates
WHERE source = 'larry') AS l
LEFT OUTER JOIN
(SELECT x, y, z
FROM coordinates
WHERE source = 'herb') AS h
ON (l.x = h.x AND l.y = h.y AND l.z = h.z)
WHERE h.x IS NULL
ORDER BY l.x,l.y,l.z;
Can it be done without the inner SELECTs? How about:
SELECT l.x, l.y, l.z
FROM coordinates l
LEFT OUTER JOIN coordinates h ON (h.x = l.x AND h.y = l.y AND h.z = l.z)
WHERE l.source = 'larry'
AND h.source = 'herb'
AND h.x IS NULL
ORDER BY l.x,l.y,l.z;
When we run this, we find that 0 rows are returned. Where did we go wrong? By placing h.source = 'herb' in the WHERE clause, we've forced the query to return Herb's matches. We can correct this by moving the check for Herb up into the OUTER JOIN condition:
SELECT l.x, l.y, l.z
FROM coordinates l
LEFT OUTER JOIN coordinates h ON (h.source = 'herb' AND h.x = l.x AND h.y = l.y AND h.z = l.z)
WHERE l.source = 'larry'
AND h.x IS NULL
ORDER BY l.x,l.y,l.z;
Why did we leave the check for l.source = 'larry' in the WHERE clause? Couldn't we put that into the OUTER JOIN condition as well? No. Remember that we must start with all of Larry's coordinates, which we achieve by restricting l.source to 'larry' in the WHERE clause.
An INTERSECTION behaves like a JOIN with a SELECT DISTINCT. Suppose for example that you wanted a two-column report of users who have the same number of group memberships as they do friends. Start by constructing the queries for the two sets to be intersected, making sure that each returns the same columns in the same order.
Memberships:
SELECT m.user_id, COUNT(*) AS the_count
FROM group_memberships m
LEFT OUTER JOIN users u ON (u.user_id = m.user_id)
GROUP BY m.user_id;
Friendships:
SELECT u.user_id, COUNT(f.unique_key) AS the_count
FROM users u
LEFT OUTER JOIN friendships f ON (f.from_user_id = u.user_id OR f.to_user_id = u.user_id)
GROUP BY u.user_id;
Once you have these, create sub-queries from them, JOIN on all columns and do a SELECT DISTINCT to eliminate repeats:
SELECT DISTINCT membership_count.user_id, membership_count.the_count
FROM
(SELECT u.user_id, COUNT(m.user_id) AS the_count
FROM users u
LEFT OUTER JOIN group_memberships m ON (m.user_id = u.user_id)
GROUP BY u.user_id) AS membership_count
JOIN
(SELECT u.user_id, COUNT(f.unique_key) AS the_count
FROM users u
LEFT OUTER JOIN friendships f ON (f.from_user_id = u.user_id OR f.to_user_id = u.user_id)
GROUP BY u.user_id) AS friend_count
ON (membership_count.user_id = friend_count.user_id AND membership_count.the_count = friend_count.the_count);
The MINUS operation can be achieved with an OUTER JOIN, which includes rows from the joined set even if they do not match the join condition. When the joined set has no matching row, NULL is returned for its columns. By restricting the join to rows where one of these columns is NULL, we can find all rows that are in the first set but not the second.
Consider for example constructing a report that returns the user ids and names of all users who like the movie Memento and did not publish any status updates in the month of December, 2010.
Users who like Memento:
SELECT u.user_id, CONCAT(u.first_names, ' ', u.last_name) AS name
FROM user_like_map m
JOIN users u ON (u.user_id = m.user_id)
JOIN likes l ON (m.like_id = l.like_id)
WHERE l.topic = 'Memento';
Users who posted updates in December, 2010:
SELECT u.user_id, CONCAT(u.first_names, ' ', u.last_name) AS name
FROM status_updates s
JOIN users u ON (u.user_id = s.user_id)
WHERE DATE(s.creation_date) BETWEEN '2010-12-01' AND '2010-12-31' GROUP BY u.user_id;
Subtract the second set by OUTER JOINing and looking for rows in the first set with no matches in the second set:
SELECT memento_likers.name
FROM
(SELECT u.user_id, CONCAT(u.first_names, ' ', u.last_name) AS name
FROM user_like_map m
JOIN users u ON (u.user_id = m.user_id)
JOIN likes l ON (m.like_id = l.like_id)
WHERE l.topic = 'Memento') AS memento_likers
LEFT OUTER JOIN
(SELECT u.user_id, CONCAT(u.first_names, ' ', u.last_name) AS name
FROM status_updates s JOIN users u ON (u.user_id = s.user_id)
WHERE DATE(s.creation_date) BETWEEN '2010-12-01' AND '2010-12-31'
GROUP BY u.user_id) AS december_updaters
ON (december_updaters.user_id = memento_likers.user_id)
WHERE december_updaters.user_id IS NULL;
SELECT *
FROM all_invitations
WHERE expires > current_date OR expires IS NULL;
Philip is lazy. He starts from the proposition that this requires joining four tables: users (for the name of the user we're serving), friendship map (to figure out who the guy's friends are), friendship map (to figure out who those friends are friends with), and then users again (so that we can print the name of the suggested friend rather than simply a user ID). With four tables being joined we know that there are going to be three join conditions. Then we'll just need a couple of extra conditions to exclude existing friends and to restrict the report to just one user.
The complicating factor here is that a friendship is represented with only one row, though it applies to pairs of users. So the obvious query joining on only, say, the from_user_id of friendships, is going to miss approximately half of the friendships at each step. Instead of having our head explode by trying to figure out the appropriate where clauses, outer joins, etc., let's create a view in honor of Kathleen Alexander that doubles the number of rows in the friendships table, making sure that if Joe User has 10 friends there are in fact 10 rows in the table with Joe's user_id in the first column?
CREATE VIEW friendships_double AS
SELECT from_user_id as user_id, to_user_id as friend_id
FROM friendships
UNION
SELECT to_user_id as user_id, from_user_id as friend_id
FROM friendships;
After the view creation, the query is more or less as noted above.
select u1.first_names, u1.last_name, u2.first_names, u2.last_name
from users u1,
friendships_double fd1,
friendships_double fd2,
users u2
where u1.user_id = fd1.user_id
and fd1.friend_id = fd2.user_id
and fd2.friend_id = u2.user_id
and u2.user_id not in
(select fd3.friend_id
from friendships_double fd3
where fd3.user_id = u1.user_id)
and u2.user_id != u1.user_id
and u1.user_id = 1
The u2.user_id != u1.user_id was added after the initial tests revealed that in fact Joe User is necessarily always a friend of his friend's (since the friendships are bidirectional).
-------------------- Andrew's more painful narrative of SQL woe
First let's create a view to filter for just the accepted friendship requests:
CREATE OR REPLACE VIEW friendships_accepted AS
SELECT to_user_id, from_user_id FROM friendships WHERE accepted IS NOT NULL;
First, let's get all of user 1's friends (by id). We'll have to do this for two cases, one where our user is the from_user_id and another where our user is the to_user_id:
SELECT to_user_id as friend_id from friendships_accepted where from_user_id=1
UNION ALL
SELECT from_user_id as friend_id from friendships_accepted where to_user_id=1;
Let's ignore that asymmetry for a moment and do some joins to get our friends-of-friends (fofs). This is how you'd do it in a directed graph (i.e. where I follow you, but you might not follow me):
SELECT DISTINCT f2.to_user_id as fof_id
FROM friendships_accepted f1
JOIN friendships_accepted f2
ON (f1.from_user_id=1 AND f1.to_user_id=f2.from_user_id);
Let's annotate that with the names of our intermediate friend and the friend-of-a-friend:
SELECT DISTINCT
f1.to_user_id as friend_id,
CONCAT(u1.first_names,' ',u1.last_name) as friend_name,
f2.to_user_id as fof_id,
CONCAT(u2.first_names,' ',u2.last_name) as fof_name
FROM friendships_accepted f1
JOIN friendships_accepted f2
ON (f1.from_user_id=1 AND f1.to_user_id=f2.from_user_id)
JOIN users u1 ON (f1.to_user_id=u1.user_id)
JOIN users u2 ON (f2.to_user_id=u2.user_id);
(Note that the above version could return duplicate fofs; the DISTINCT keyword eliminates duplicate rows, but because our rows here include the intermediate friend, a fof might appear more than once if there are multiple paths to that fof via different friends.)
Now how do we handle our real case, where a friendship for user X may be recorded with X in either the from_user_id or the to_user_id column? We have to consider four cases:
In SQL, this is accomplished by UNIONing the four separate queries. This combines the results, filtering out duplicates so no DISTINCT keyword is required. It looks like this:
CREATE OR REPLACE VIEW friends_of_friends AS
-- case 1: our user is f1.from_user_id, and we join with f2.from_user_id
SELECT f2.to_user_id as fof_id
FROM friendships_accepted f1 JOIN friendships_accepted f2
ON (f1.from_user_id = 1 AND f1.to_user_id = f2.from_user_id)
-- case 2: our user is f1.from_user_id, and we join with f2.to_user_id
UNION SELECT f2.from_user_id as fof_id
FROM friendships_accepted f1 JOIN friendships_accepted f2
ON (f1.from_user_id = 1 AND f1.to_user_id = f2.to_user_id)
-- case 3: our user is f1.to_user_id, and we join with f2.from_user_id
UNION SELECT f2.to_user_id as fof_id
FROM friendships_accepted f1 JOIN friendships_accepted f2
ON (f1.to_user_id = 1 AND f1.from_user_id = f2.from_user_id)
-- case 4: our user is f1.to_user_id, and we join with f2.to_user_id
UNION SELECT f2.from_user_id as fof_id
FROM friendships_accepted f1 JOIN friendships_accepted f2
ON (f1.to_user_id = 1 AND f1.from_user_id = f2.to_user_id);
To filter out user 1's existing friends, along with user 1 him/herself, we can put that monstrous thing inside a subquery:
SELECT fof_id
FROM friends_of_friends
WHERE fof_id <> 1
AND fof_id NOT IN
(SELECT to_user_id FROM friendships_accepted WHERE from_user_id = 1
UNION
SELECT from_user_id FROM friendships_accepted WHERE to_user_id = 1);
Alternatively, we can use a LEFT JOIN to join these results with user 1's existing friends, then filter for rows that have a NULL value to show only those that don't have an existing friendship with user 1:
SELECT fof_id
FROM friends_of_friends
LEFT JOIN friendships_accepted fr1
ON (fof_id = fr1.from_user_id AND fr1.to_user_id = 1)
LEFT JOIN friendships_accepted fr2
ON (fof_id = fr2.to_user_id AND fr2.from_user_id = 1)
WHERE fof_id <> 1
AND fr1.from_user_id IS NULL
AND fr2.to_user_id IS NULL;
Note that in the above query we use aliases fr1 and fr2 for the friendships table; this makes it clear that we aren't referring to the aliases f1 and f2 in the subquery, but isn't strictly necessary. Table aliases in the subquery are only visible within the subquery; only the final results of the subquery--aliased as subq-- would be available in the top-level select statement.
-------------------- Shimon's "bidi" solution
CREATE OR REPLACE VIEW friendships_bidi AS
SELECT from_user_id as user_a, to_user_id as user_b, creation_date, accepted
FROM friendships
UNION ALL
SELECT to_user_id as user_a, from_user_id as user_b, creation_date, accepted
FROM friendships;
SELECT f2.to_user_id as fof_id
FROM friendships_bidi f1 JOIN friendships_bidi f2
ON (f1.from_user_id = 1 AND f1.to_user_id = f2.from_user_id)
In order to display the most recent status message, you will need to
TIP: If your changes generate a "Failed to get data" message and you are having trouble tracking down the error, you can write it out to one of the TextViews on the screen.
Tips for solving this exercise:
[a]Why do we need DISTINCT here? This query shouldn't introduce any duplication.—shimonrura
AG: True for the data provided, but we would get duplication if Larry or Herb had duplicate coordinates in their own reports.