CREATE TABLE payroll ( user_id INT PRIMARY KEY, name TEXT, job TEXT, salary INT ); INSERT INTO payroll VALUES (123, 'Jack', 'TA', 50000), (345, 'Allison', 'TA', 60000), (567, 'Magda', 'Prof', 90000), (789, 'Dan', 'Prof', 100000); CREATE TABLE regist ( user_id INT REFERENCES payroll, car TEXT ); INSERT INTO regist VALUES (123, 'Charger'), (567, 'Civic'), (567, 'Pinto'); -------------------------- -- SELECT count(*) as c FROM payroll WHERE job = 'TA'; SELECT count(*) as c, avg(salary) as a FROM payroll WHERE job = 'TA'; SELECT job, count(*) FROM payroll GROUP BY job; SELECT job, count(*) FROM payroll WHERE salary > 55000 GROUP BY job; SELECT job, count(*) FROM payroll WHERE salary > 75000 GROUP BY job; -------------------------- -- Empty groups SELECT P.name, count(*) FROM payroll P, regist R WHERE P.user_id = R.user_id GROUP by P.user_id, P.name; SELECT P.name, count(*) FROM payroll P LEFT OUTER JOIN regist R ON P.user_id = R.user_id GROUP by P.user_id, P.name; SELECT P.name, count(R.car) FROM payroll P LEFT OUTER JOIN regist R ON P.user_id = R.user_id GROUP by P.user_id, P.name; SELECT P1.job, count(DISTINCT P2.user_id) FROM payroll P1 LEFT OUTER JOIN payroll P2 ON P1.job = P2.job and P2.salary > 75000 GROUP BY P1.job; -- we must use P2.salary > 75000! -- using P1.salary > 75000 is incorrect -- to see why, let's add a 3rd job, with one salary <75000 and one >75000: INSERT INTO payroll VALUES (222, 'Joe', 'Lec', 50000), (333, 'Jim', 'Lec', 90000); SELECT * FROM payroll; -- for the group 'Lec' we expect count = 1 -- but using P1.salary>75000, we get count = 2: SELECT P1.job, count(DISTINCT P2.user_id) FROM payroll P1 LEFT OUTER JOIN payroll P2 ON P1.job = P2.job and P1.salary > 75000 GROUP BY P1.job; -- to understand why it is incorrect, look at output to the full query -- make sure you understand how the LEFT OUT JOIN worked: SELECT * FROM payroll P1 LEFT OUTER JOIN payroll P2 ON P1.job = P2.job and P1.salary > 75000; -- the correct condition is P2.salary > 75000: SELECT P1.job, count(DISTINCT P2.user_id) FROM payroll P1 LEFT OUTER JOIN payroll P2 ON P1.job = P2.job and P2.salary > 75000 GROUP BY P1.job; -- also look at the full query: SELECT * FROM payroll P1 LEFT OUTER JOIN payroll P2 ON P1.job = P2.job and P2.salary > 75000; -------------------------- SELECT P1.job, P2.name, P2.salary FROM payroll AS P1, payroll AS P2 WHERE P1.job = P2.job GROUP BY P1.job, P2.name, P2.salary HAVING MAX(P1.salary) = P2.salary; ----------------------------------------------------------------------------------------------------------------------- CREATE TABLE payroll ( user_id INT PRIMARY KEY, name TEXT, job TEXT, salary INT ); INSERT INTO payroll VALUES (123, 'Jack', 'TA', 50000), (345, 'Allison', 'TA', 60000), (567, 'Magda', 'Prof', 90000), (789, 'Dan', 'Prof', 100000); CREATE TABLE regist ( user_id INT REFERENCES payroll, car TEXT ); INSERT INTO regist VALUES (123, 'Charger'), (567, 'Civic'), (567, 'Pinto'); SELECT * FROM payroll; SELECT * FROM regist; -- Jack buys a Pinto INSERT INTO regist VALUES (123, 'Pinto'); SELECT * FROM regist; -- avg salary of people who drive a Pinto. -- notice the join, which happens before aggregation. SELECT avg(salary) FROM payroll p JOIN regist r ON p.user_id = r.user_id WHERE car = 'Pinto'; -- intermediate result of above query before aggregation SELECT salary FROM payroll p JOIN regist r ON p.user_id = r.user_id WHERE car = 'Pinto'; -- Jack sells the Pinto DELETE FROM regist WHERE user_id = 123 AND car = 'Pinto'; SELECT * FROM regist; -- how many people drive a car? -- broken version: SELECT count(*) FROM payroll p JOIN regist r ON p.user_id = r.user_id; -- intermediate result of above query before aggregation SELECT * FROM payroll p JOIN regist r ON p.user_id = r.user_id; -- fixed: SELECT count(DISTINCT p.user_id) FROM payroll p JOIN regist r ON p.user_id = r.user_id; -- avg salary of people who drive a car -- broken version 1: SELECT avg(salary) FROM payroll p JOIN regist r ON p.user_id = r.user_id; -- intermediate result of above query before aggregation SELECT salary FROM payroll p JOIN regist r ON p.user_id = r.user_id; -- intermediate result with names attached SELECT name, salary FROM payroll p JOIN regist r ON p.user_id = r.user_id; -- broken version 2: SELECT avg(DISTINCT salary) FROM payroll p JOIN regist r ON p.user_id = r.user_id; -- actually returns the right answer in this case, but... -- allison gets a pay cut UPDATE payroll SET salary = 50000 WHERE user_id = 345; SELECT * FROM payroll; -- allison buys a tesla INSERT INTO regist VALUES (345, 'Tesla'); SELECT * FROM regist; -- broken version 2 again: SELECT avg(DISTINCT salary) FROM payroll p JOIN regist r ON p.user_id = r.user_id; -- now returns the wrong answer -- intermediate result with names SELECT name, salary FROM payroll p JOIN regist r ON p.user_id = r.user_id; -- this query is hard to fix with what we know. -- we will see how to fix it with a subquery next time. -- give allison her old salary back UPDATE payroll SET salary = 60000 WHERE user_id = 345; SELECT * FROM PAYROLL; -- allison sells the tesla DELETE FROM regist WHERE user_id = 345 AND car = 'Tesla'; SELECT * FROM regist; -- how many cars does each person have? SELECT p.user_id, name, count(*) FROM payroll p JOIN regist r ON p.user_id = r.user_id GROUP BY p.user_id; -- side note: the above query would be rejected by most RDBMSes because name is -- not mentioned in GROUP BY nor aggregated -- -- it is correct in SQLite because name is guaranteed by SQLite to be drawn from -- *some* row in the group, and all rows in the group have the same name (why?) -------------------------------------------------------------------------------- -- now what if we want to include people who drive 0 cars? -- broken version: SELECT p.user_id, name, count(*) FROM payroll p LEFT OUTER JOIN regist r ON p.user_id = r.user_id GROUP BY p.user_id; -- fixed: SELECT p.user_id, name, count(r.car) FROM payroll p LEFT OUTER JOIN regist r ON p.user_id = r.user_id GROUP BY p.user_id; -- for each job, how many people earn more than 75000? -- broken: SELECT job, count(*) FROM payroll WHERE salary > 75000 GROUP BY job; -- fixed SELECT p1.job, count(DISTINCT p2.user_id) FROM payroll p1 LEFT OUTER JOIN payroll p2 ON p1.job = p2.job AND p2.salary > 75000 GROUP BY p1.job; -- intermediate result for above query SELECT * FROM payroll p1 LEFT OUTER JOIN payroll p2 ON p1.job = p2.job AND p2.salary > 75000; -- find people who drive an even number of cars SELECT p.user_id, name, count(r.car) as car_count FROM payroll p LEFT OUTER JOIN regist r ON p.user_id = r.user_id GROUP BY p.user_id HAVING car_count % 2 = 0; -- find person with highest salary for each job -- step 1: find max salary SELECT job, max(salary) FROM payroll GROUP BY job; -- step 2: in SQLite, you're done. (slides are wrong about this.) SELECT job, max(salary), name FROM payroll GROUP BY job; -- now how do you do it in Postgres and every other RDBMS besides SQLite? SELECT p1.job, max(p1.salary), p2.name FROM payroll p1 JOIN payroll p2 ON p1.job = p2.job GROUP BY p1.job, p2.name, p2.salary HAVING p2.salary = max(p1.salary); -- note the use of the self-join and the group by clause. -- non-SQLite DBMSes require us to group by all the non-aggregated columns. -- We call this kind of problem "the witnessing problem" because it involves -- finding the "witness" to the maximum, ie, the row that achieves the maximum. -- Also similar to argmax.