Data Engineering
SQL
1
Next: Doing more with Select From Where
Basic Form:
SELECT attributes
FROM tables
WHERE condition about tuples in tables
2
Extending SELECT
SELECT can employ:
id | race | gender | age | warning | citation | arrest |
17213 | asian | F | 23 | False | True | False |
1 | white | M | 45 | True | False | False |
2 | black | M | 37 | False | False | False |
19 | hispanic | F | 58 | False | True | False |
SELECT id AS number, arrest AS arrested FROM Stops
Extending SELECT
SELECT can employ:
id | race | gender | age | warning | citation | arrest |
17213 | asian | F | 23 | False | True | False |
1 | white | M | 45 | True | False | False |
2 | black | M | 37 | False | False | False |
19 | hispanic | F | 58 | False | True | False |
SELECT id, age/10 AS ageDecade, arrest
FROM Stops
Extending SELECT
SELECT can employ:
id | race | gender | age | warning | citation | arrest |
17213 | asian | F | 23 | False | True | False |
1 | white | M | 45 | True | False | False |
2 | black | M | 37 | False | False | False |
19 | hispanic | F | 58 | False | True | False |
SELECT id, CASE
WHEN age > 60 THEN ‘senior’
WHEN age <=60 AND age > 20 THEN ‘adult’
ELSE ‘juvenile’
END AS age_type,
arrest FROM Stops
Extending WHERE
WHERE can employ:
id | race | gender | age | warning | citation | arrest |
17213 | asian | F | 23 | False | True | False |
1 | white | M | 45 | True | False | False |
2 | black | M | 37 | False | False | False |
19 | hispanic | F | 58 | False | True | False |
SELECT * FROM Stops
WHERE gender != “M” AND age > 30
Extending WHERE
WHERE can employ:
id | race | gender | age | warning | citation | arrest |
17213 | asian | F | 23 | False | True | False |
1 | white | M | 45 | True | False | False |
2 | black | M | 37 | False | False | False |
19 | hispanic | F | 58 | False | True | False |
SELECT * FROM Stops
WHERE race LIKE ‘%a%’
How to Read a SQL Query
8
Recap: The Basic Form: Select From Where
Basic Form:
SELECT attributes
FROM tables
WHERE condition about tuples in tables
9
Recap: Multi-relation Queries
10
id | race | location | age | warning | citation | arrest |
17213 | asian | MacArthur | 23 | False | True | False |
1 | white | West Oakland | 45 | True | False | False |
2 | black | West Oakland | 37 | False | False | False |
19 | hispanic | Civic Center | 58 | False | True | False |
location | zipcode |
MacArthur | 94621 |
West Oakland | 94609 |
Civic Center | 94612 |
SELECT * FROM Stops, Zips WHERE Stops.location = Zips.location
SELECT * FROM Stops AS S, Zips AS Z WHERE S.location = Z.location
SELECT * FROM Stops S, Zips Z WHERE S.location = Z.location
If only race and zipcode are desired:
SELECT race, zipcode FROM Stops AS S, Zips AS Z WHERE S.location = Z.location
Recap: Reading a SQL Query w/ Mult. Tables
11
Recap
12
Subqueries
13
Subquery as a Scalar
14
id | race | location | arrest |
17213 | asian | MacArthur | False |
1 | white | West Oakland | False |
2 | black | West Oakland | False |
19 | hispanic | Civic Center | False |
Subquery as a Scalar
SELECT S1.id, S1.race, S1.location, S1.arrest
FROM Stops as S1, Stops as S2
WHERE S1.location = S2.location AND S2.id = 123
This is known as a self-join – two (or more) copies of the same relation
15
id | race | location | arrest |
17213 | asian | MacArthur | False |
1 | white | West Oakland | False |
2 | black | West Oakland | False |
19 | hispanic | Civic Center | False |
Subquery as a Scalar
SELECT S1.id, S1.race, S1.location, S1.arrest
FROM Stops S1
WHERE S1.location = (SELECT S2.location FROM Stops S2 WHERE S2.id = 123)
16
id | race | location | arrest |
17213 | asian | MacArthur | False |
1 | white | West Oakland | False |
2 | black | West Oakland | False |
19 | hispanic | Civic Center | False |
Subqueries
17
Subquery as a Set: EXISTS
SELECT Stops.location FROM Stops
WHERE NOT EXISTS
(SELECT * FROM Zips WHERE Zips.location = Stops.location)
18
id | race | location | arrest |
17213 | asian | MacArthur | False |
1 | white | West Oakland | False |
2 | black | West Oakland | False |
19 | hispanic | Civic Center | False |
location | zipcode |
MacArthur | 94621 |
West Oakland | 94609 |
Civic Center | 94612 |
EXISTS Exercise
Determine all the Stops that are the only one in their zipcode
19
id | race | location | arrest |
17213 | asian | MacArthur | False |
1 | white | West Oakland | False |
2 | black | West Oakland | False |
19 | hispanic | Civic Center | False |
location | zipcode |
MacArthur | 94621 |
West Oakland | 94609 |
Civic Center | 94612 |
SELECT * FROM Stops S1 NATURAL JOIN Zips Z1
WHERE NOT EXISTS
(SELECT * FROM Stops S2 NATURAL JOIN Zips Z2 WHERE Z1.zipcode = Z2.zipcode AND S1.id != S2.id)
Subqueries
20
Recap
21
Multiset/bag Operations
(SELECT location FROM Stops)
EXCEPT ALL
(SELECT location FROM Zips)
22
Multiset/bag Operations
23
Recap
24
Aggregations
25
Removing Duplicates
26
Detour: Null Values
27
id | race | location | age | arrest |
17213 | asian | MacArthur | 23 | False |
1 | white | West Oakland | NULL | False |
2 | black | NULL | 37 | False |
19 | NULL | Civic Center | 58 | False |
NULLs in Aggregation
28
id | race | location | age | arrest |
17213 | asian | MacArthur | 23 | False |
1 | white | West Oakland | NULL | False |
2 | black | NULL | 37 | False |
19 | NULL | Civic Center | 58 | False |
Grouping
SELECT location, AVG(age) AS avgage, MIN (age) as minage
FROM Stops GROUP BY location;
29
id | race | location | age | arrest |
17213 | asian | MacArthur | 23 | False |
1 | white | West Oakland | NULL | False |
2 | black | NULL | 37 | False |
19 | NULL | Civic Center | 58 | False |
Visualizing Grouping
SELECT location, AVG(age) AS avgage, MIN (age) as minage
FROM Stops GROUP BY location;
30
id | loc | age |
… | MacArthur | … |
… | West Oakland | … |
… | MacArthur | … |
… | West Oakland | … |
… | Civic Center | … |
id | loc | age |
… | MacArthur | … |
… | MacArthur | … |
id | loc | age |
… | West Oakland | … |
… | West Oakland | … |
id | loc | age |
… | Civic Center | … |
loc | avgage | minage |
MacArthur | … | … |
West Oakland | … | … |
Civic Center | … | … |
Grouping (contd.)
31
id | race | location | arrest |
17213 | asian | MacArthur | False |
1 | white | West Oakland | False |
2 | black | West Oakland | False |
19 | hispanic | Civic Center | False |
location | zipcode |
MacArthur | 94621 |
West Oakland | 94609 |
Civic Center | 94612 |
Grouping (contd.)
SELECT zipcode, COUNT (*)
FROM Stops, Zips
WHERE Stops.location = Zips.location
GROUP BY zipcode
32
id | race | location | arrest |
17213 | asian | MacArthur | False |
1 | white | West Oakland | False |
2 | black | West Oakland | False |
19 | hispanic | Civic Center | False |
location | zipcode |
MacArthur | 94621 |
West Oakland | 94609 |
Civic Center | 94612 |
Restriction of SELECT list with aggregation
33
Exercise
34
id | race | location | age | arrest |
17213 | asian | MacArthur | 23 | False |
1 | white | West Oakland | NULL | False |
2 | black | NULL | 37 | False |
19 | NULL | Civic Center | 58 | False |
Exercise
SELECT s.id FROM Stops
WHERE s.age >=
(SELECT MAX(s2.age) FROM Stops)
35
id | race | location | age | arrest |
17213 | asian | MacArthur | 23 | False |
1 | white | West Oakland | NULL | False |
2 | black | NULL | 37 | False |
19 | NULL | Civic Center | 58 | False |
Exercise
Exercise
SELECT race,
AVG (CASE WHEN location = ‘West Oakland’ THEN age ELSE NULL END) AS west_oakland_avg,
AVG (CASE WHEN location = ‘Rockridge’ THEN age ELSE NULL END) AS rockridge_avg
FROM Stops GROUP BY Race
Look up FILTERS in Aggregates for an alternate syntax
Sophisticated Grouping: Percentiles
HAVING Clauses
SELECT location, COUNT (*)
FROM Stops
GROUP BY location
HAVING COUNT (*) > 30
39
Restrictions on HAVING Clauses
40
General Form of GROUPING
SELECT S
FROM R1, R2, …
WHERE C1
GROUP BY A1, A2, …
HAVING C2
41
General Form of GROUPING
SELECT S
FROM R1, R2, …
WHERE C1
GROUP BY A1, A2, …
HAVING C2
Order of evaluation:
42
First
Second
Third
Fourth
Fifth
Recap
43
Ordering: ORDER BY
SELECT * FROM Stops
ORDER BY age DESC;
SELECT * FROM Stops
ORDER BY age DESC, id ASC;
ASC can be omitted – ascending by default
44
Restrict output: LIMIT & OFFSET
SELECT *
FROM Stops
ORDER BY age DESC LIMIT 15;
SELECT *
FROM Stops
ORDER BY age DESC LIMIT 5 OFFSET 10
45