数据分析中的MySQL

MySQL简单总结

条件查找

select * from dataAnalyst
where (city = '上海'
and education = '本科')
or 
(city = '北京'
and education = '硕士')

select * from dataAnalyst
where (city = '上海'
and education = '本科')
or workYear = '1-3年'
# 翻译：上海和本科同时满足，或者只满足workyear

select * from dataAnalyst
where city = '上海'
and (education = '本科'
or workYear = '1-3年')
# 翻译：上海的，学历或者工作经验满足

模糊查找

select * from dataAnalyst
where
secondType like '%开发%'

select * from dataAnalyst
where
secondType like '后端%'

Groupby

1 2	select city from dataAnalyst group by city

select city,count(positionId) from dataAnalyst
group by city
 # count(positionId) = count(1) = count(*)
 # 对于包不包含控制计算有细微差异
select city,count(1),count(*) from dataAnalyst
group by city

去重复——DISTINCT

1 2	select city,count(positionId),count(distinct companyId) from dataAnalyst group by city

1 2	select city,education,count(1) from dataAnalyst group by city,education

Having

# having 就是对groupby进行过滤的
select city,education,count(1) from dataAnalyst
group by city,education
having count(positionId) >= 100

Like

# 获取字段含有“电子商务”的用like
select city,count(1) from dataAnalyst
where industryField like '%电子商务%'
group by city
having count(positionId) >= 50

count if

# 这里只输出城市（嵌套查询）
select city from dataAnalyst
group by city
having count(if(industryField like '%电子商务%',1,null)) >= 50

不同城市下面电商占这个城市的招聘人数的占比并过滤大于10

select city,count(1),
count(if(industryField like '%电子商务%',industryField,null)),
count(if(industryField like '%电子商务%',industryField,null))/count(1)
from dataAnalyst
GROUP BY city
having count(if(industryField like '%电子商务%',industryField,null)) >= 10
ORDER BY count(if(industryField like '%电子商务%',industryField,null))

优雅的写法——As

select city,count(1),
count(if(industryField like '%电子商务%',industryField,null)) as emarket,
count(if(industryField like '%电子商务%',industryField,null))/count(1)
from dataAnalyst
GROUP BY city 
having emarket >= 10
ORDER BY emarket desc

SQL函数

left和locate和right

1
2
3

select left(salary,locate('k',salary)-1),salary from dataAnalyst

select 
left(salary,locate('k',salary)-1),
locate('-',salary),
length(salary),
left(RIGHT(salary,length(salary)-locate('-',salary)),length(salary)-locate('-',salary)-1),
salary
salary from dataAnalyst

substr

substr(字符串，从哪里开始，截取长度)

select 
left(salary,locate('k',salary)-1),
substr(salary,locate('-',salary)+1,length(salary)-locate('-',salary)-1),
salary from dataAnalyst

子查询

基于查询结果的查询，嵌套查询

select (bottom + top)/2 from(
select 
left(salary,locate('k',salary)-1) as bottom,
substr(salary,locate('-',salary)+1,length(salary)-locate('-',salary)-1) as top,
salary 
from dataAnalyst) as t

分组——case when

主要用来数据清洗

select
	case 
		when (bottom+top)/2 <= 10 then '0-10'
		when (bottom+top)/2 between 10 and 20 then '10-20'
		when (bottom+top)/2 <= 30 then '10-30'
		else '30+'
	end,
	salary from(
		select 
			left(salary,locate('k',salary)-1) as bottom,
			substr(salary,locate('-',salary)+1,length(salary)-locate('-',salary)-1) as top,
			salary 
		from dataAnalyst) as t

过滤——where in

select * from dataAnalyst
where city in(
select city from dataAnalyst
group by city having count(positionId) >= 100)

多表聚合查询——Join

某一个公司招聘职位，在另一个表中查找信息

select * from dataAnalyst
where companyId = (
select companyId from company
where companyShortName = '唯医网'
)

连结两张表

select * from dataAnalyst										❌
join company on companyId = companyId 			❌
# 没结果，因为companyId不唯一

#正确写法
select * from dataAnalyst as d
join company as c on d.companyId = c.companyId

各种Join

左连接

保留左边，右边硬凑，没有就null

select * from dataAnalyst as d
left join(select * from company
where companySize = '150-500人') as t
on t.companyId = d.companyId

join就是取交集，left join就是A是全部B是部分

# 排除法统计
select * from dataAnalyst as d
left join(select * from company
where companySize = '150-500人') as t
on t.companyId = d.companyId
where t.companyId is null  # 过滤重复的部分

150-500人的占比

select count(1),count(t.companyId),count(t.companyId)/count(1) from dataAnalyst as d
left join(select * from company
where companySize = '150-500人') as t
on t.companyId = d.companyId

LeetCode——训练刷题

时间

1	select now()

select 
paidTime,date(paidTime),date_format(paidTime,'%Y-%m-%d'),
date_format(date_add(paidTime,interval -1 day),'%Y-%m-%d')
from orderinfo

练习题

统计不同月份的下单人数
统计用户三月份的回购率和复购率
统计男女用户的消费频次是否有差异
统计多次消费的用户，第一次和最后一次消费间隔是多少?
统计不同年龄段，用户的消费金额是否有差异?
统计消费的二八法则，消费的top20%用户，贡献了多少额度

统计不同月份的下单人数

select month(paidTime),count(distinct userId)
from orderinfo
where isPaid = '已支付'
GROUP BY month(paidTime)

统计用户三月份的回购率和复购率

# 统计用户三月份的复购率
select count(ct),count(if(ct>1,1,null)),count(if(ct>1,1,null))/count(ct) from(
	select userId,count(userId) as ct from orderinfo
	where isPaid = '已支付'
	and month(paidTime) = 3
	group by userId) as t

不是很好的方法：

# 统计用户三月份的回购率
SELECT count(DISTINCT userId) from orderinfo
where userId in (
	select userId from orderinfo
  where isPaid = '已支付'
	and month(paidTime) = 3)
and month(paidTime) = 4

select count(DISTINCT userId) from orderinfo
where isPaid = '已支付'
and month(paidTime) = 3

首先查找出userId在每个月份的支付情况

1
2
3

select userId,date_format(paidTime,'%Y-%m') as m from orderinfo
where isPaid = '已支付'
GROUP BY userId,date_format(paidTime,'%Y-%m')

然后左连接两张相同的表：

select * from(
	select userId,date_format(paidTime,'%Y-%m') as m from orderinfo
	where isPaid = '已支付'
	GROUP BY userId,date_format(paidTime,'%Y-%m')) t1
left join(
	select userId,date_format(paidTime,'%Y-%m') as m from orderinfo
	where isPaid = '已支付'
	GROUP BY userId,date_format(paidTime,'%Y-%m')) t2
on t1.userId = t2.userId

此时得到的是两张表的笛卡尔积（左连接非唯一字段，是局部笛卡尔积。）

然后增加条件筛选，此时添加了非唯一条件，笛卡尔积自动去除

可以统计所有的月份：
select t1.m,count(t1.m),count(t2.m),count(t2.m)/count(t1.m) from(
	select userId,date_format(paidTime,'%Y-%m-01') as m from orderinfo
	where isPaid = '已支付'
	GROUP BY userId,date_format(paidTime,'%Y-%m-01')) t1
left join(
	select userId,date_format(paidTime,'%Y-%m-01') as m from orderinfo
	where isPaid = '已支付'
	GROUP BY userId,date_format(paidTime,'%Y-%m-01')) t2
on t1.userId = t2.userId and t1.m = date_sub(t2.m,interval 1 month)
GROUP BY t1.m

统计男女用户的消费频次是否有差异

平均数

select sex,avg(ct) from (
	select o.userId,sex,count(1) as ct from orderinfo as o
	inner join(
		select * from userinfo 
		where sex is not null) t
	on o.userId = t.userId
	group by userId,sex) t2
group by sex

统计多次消费的用户，第一次和最后一次消费间隔是多少?

select userId,datediff(max(paidTime),min(paidTime)) from orderinfo
where isPaid = '已支付'
group by userId having count(1) > 1
# datediff把秒数变成天数

统计不同年龄段，用户的消费金额是否有差异?

年龄除10然后取整数

select o.userId,age,count(o.userId) from orderinfo o
inner join(
	select userId,ceil((year(now())-year(birth))/10) as age from userinfo
	where birth > '1900-00-00')t
on o.userId = t.userId
group by o.userId,age

平均数

select age,avg(ct) from (
	select o.userId,age,count(o.userId) as ct from orderinfo o
	inner join(
		select userId,ceil((year(now())-year(birth))/10) as age from userinfo
		where birth > '1900-00-00')t
	on o.userId = t.userId
	group by o.userId,age)t2
group by age

统计消费的二八法则，消费的top20%用户，贡献了多少额度

先统计总人数

select count(userId)*0.2 from(
select userId,sum(price) as total from orderinfo o
where isPaid = '已支付'
group by userId
order by total desc)as t

统计贡献

select count(userId),sum(total) from(
	select userId,sum(price) as total from orderinfo o
	where isPaid = '已支付'
	group by userId
	order by total desc
	limit 17000)t

SQL连接PowerBI