数据分析中的MySQL

MySQL简单总结

条件查找

1
2
3
4
5
6
select * from dataAnalyst
where (city = '上海'
and education = '本科')
or
(city = '北京'
and education = '硕士')
1
2
3
4
5
6
7
8
9
10
11
select * from dataAnalyst
where (city = '上海'
and education = '本科')
or workYear = '1-3年'
# 翻译:上海和本科同时满足,或者只满足workyear

select * from dataAnalyst
where city = '上海'
and (education = '本科'
or workYear = '1-3年')
# 翻译:上海的,学历或者工作经验满足
模糊查找
1
2
3
4
5
6
7
select * from dataAnalyst
where
secondType like '%开发%'

select * from dataAnalyst
where
secondType like '后端%'

Groupby

1
2
select city from dataAnalyst
group by city
1
2
3
4
5
6
select city,count(positionId) from dataAnalyst
group by city
# count(positionId) = count(1) = count(*)
# 对于包不包含控制计算有细微差异
select city,count(1),count(*) from dataAnalyst
group by city
去重复——DISTINCT
1
2
select city,count(positionId),count(distinct companyId) from dataAnalyst
group by city
1
2
select city,education,count(1) from dataAnalyst
group by city,education
Having
1
2
3
4
# having 就是对groupby进行过滤的
select city,education,count(1) from dataAnalyst
group by city,education
having count(positionId) >= 100
Like
1
2
3
4
5
# 获取字段含有“电子商务”的用like
select city,count(1) from dataAnalyst
where industryField like '%电子商务%'
group by city
having count(positionId) >= 50
count if
1
2
3
4
# 这里只输出城市(嵌套查询)
select city from dataAnalyst
group by city
having count(if(industryField like '%电子商务%',1,null)) >= 50

不同城市下面电商占这个城市的招聘人数的占比并过滤大于10

1
2
3
4
5
6
7
select city,count(1),
count(if(industryField like '%电子商务%',industryField,null)),
count(if(industryField like '%电子商务%',industryField,null))/count(1)
from dataAnalyst
GROUP BY city
having count(if(industryField like '%电子商务%',industryField,null)) >= 10
ORDER BY count(if(industryField like '%电子商务%',industryField,null))

优雅的写法——As

1
2
3
4
5
6
7
select city,count(1),
count(if(industryField like '%电子商务%',industryField,null)) as emarket,
count(if(industryField like '%电子商务%',industryField,null))/count(1)
from dataAnalyst
GROUP BY city
having emarket >= 10
ORDER BY emarket desc

SQL函数

left和locate和right
1
2
3
select left(salary,locate('k',salary)-1),salary from dataAnalyst


1
2
3
4
5
6
7
select 
left(salary,locate('k',salary)-1),
locate('-',salary),
length(salary),
left(RIGHT(salary,length(salary)-locate('-',salary)),length(salary)-locate('-',salary)-1),
salary
salary from dataAnalyst

substr

substr(字符串,从哪里开始,截取长度)

1
2
3
4
select 
left(salary,locate('k',salary)-1),
substr(salary,locate('-',salary)+1,length(salary)-locate('-',salary)-1),
salary from dataAnalyst

子查询

基于查询结果的查询,嵌套查询

1
2
3
4
5
6
select (bottom + top)/2 from(
select
left(salary,locate('k',salary)-1) as bottom,
substr(salary,locate('-',salary)+1,length(salary)-locate('-',salary)-1) as top,
salary
from dataAnalyst) as t

分组——case when

主要用来数据清洗

1
2
3
4
5
6
7
8
9
10
11
12
13
select
case
when (bottom+top)/2 <= 10 then '0-10'
when (bottom+top)/2 between 10 and 20 then '10-20'
when (bottom+top)/2 <= 30 then '10-30'
else '30+'
end,
salary from(
select
left(salary,locate('k',salary)-1) as bottom,
substr(salary,locate('-',salary)+1,length(salary)-locate('-',salary)-1) as top,
salary
from dataAnalyst) as t

过滤——where in
1
2
3
4
select * from dataAnalyst
where city in(
select city from dataAnalyst
group by city having count(positionId) >= 100)

多表聚合查询——Join

某一个公司招聘职位,在另一个表中查找信息

1
2
3
4
5
select * from dataAnalyst
where companyId = (
select companyId from company
where companyShortName = '唯医网'
)
连结两张表
1
2
3
4
5
6
7
select * from dataAnalyst										❌
join company on companyId = companyId ❌
# 没结果,因为companyId不唯一

#正确写法
select * from dataAnalyst as d
join company as c on d.companyId = c.companyId

各种Join

左连接

保留左边,右边硬凑,没有就null

1
2
3
4
select * from dataAnalyst as d
left join(select * from company
where companySize = '150-500人') as t
on t.companyId = d.companyId

join就是取交集,left join就是A是全部B是部分

1
2
3
4
5
6
# 排除法统计
select * from dataAnalyst as d
left join(select * from company
where companySize = '150-500人') as t
on t.companyId = d.companyId
where t.companyId is null # 过滤重复的部分

150-500人的占比

1
2
3
4
select count(1),count(t.companyId),count(t.companyId)/count(1) from dataAnalyst as d
left join(select * from company
where companySize = '150-500人') as t
on t.companyId = d.companyId

LeetCode——训练刷题

时间

1
select now()
1
2
3
4
select 
paidTime,date(paidTime),date_format(paidTime,'%Y-%m-%d'),
date_format(date_add(paidTime,interval -1 day),'%Y-%m-%d')
from orderinfo

练习题

统计不同月份的下单人数
统计用户三月份的回购率和复购率
统计男女用户的消费频次是否有差异
统计多次消费的用户,第一次和最后一次消费间隔是多少?
统计不同年龄段,用户的消费金额是否有差异?
统计消费的二八法则,消费的top20%用户,贡献了多少额度

统计不同月份的下单人数

1
2
3
4
select month(paidTime),count(distinct userId)
from orderinfo
where isPaid = '已支付'
GROUP BY month(paidTime)

统计用户三月份的回购率和复购率

1
2
3
4
5
6
# 统计用户三月份的复购率
select count(ct),count(if(ct>1,1,null)),count(if(ct>1,1,null))/count(ct) from(
select userId,count(userId) as ct from orderinfo
where isPaid = '已支付'
and month(paidTime) = 3
group by userId) as t

不是很好的方法:

1
2
3
4
5
6
7
8
9
10
11
# 统计用户三月份的回购率
SELECT count(DISTINCT userId) from orderinfo
where userId in (
select userId from orderinfo
where isPaid = '已支付'
and month(paidTime) = 3)
and month(paidTime) = 4

select count(DISTINCT userId) from orderinfo
where isPaid = '已支付'
and month(paidTime) = 3

首先查找出userId在每个月份的支付情况

1
2
3
select userId,date_format(paidTime,'%Y-%m') as m from orderinfo
where isPaid = '已支付'
GROUP BY userId,date_format(paidTime,'%Y-%m')

然后左连接两张相同的表:

1
2
3
4
5
6
7
8
9
select * from(
select userId,date_format(paidTime,'%Y-%m') as m from orderinfo
where isPaid = '已支付'
GROUP BY userId,date_format(paidTime,'%Y-%m')) t1
left join(
select userId,date_format(paidTime,'%Y-%m') as m from orderinfo
where isPaid = '已支付'
GROUP BY userId,date_format(paidTime,'%Y-%m')) t2
on t1.userId = t2.userId

此时得到的是两张表的笛卡尔积(左连接非唯一字段,是局部笛卡尔积。

然后增加条件筛选,此时添加了非唯一条件,笛卡尔积自动去除

1
2
3
4
5
6
7
8
9
10
11
可以统计所有的月份:
select t1.m,count(t1.m),count(t2.m),count(t2.m)/count(t1.m) from(
select userId,date_format(paidTime,'%Y-%m-01') as m from orderinfo
where isPaid = '已支付'
GROUP BY userId,date_format(paidTime,'%Y-%m-01')) t1
left join(
select userId,date_format(paidTime,'%Y-%m-01') as m from orderinfo
where isPaid = '已支付'
GROUP BY userId,date_format(paidTime,'%Y-%m-01')) t2
on t1.userId = t2.userId and t1.m = date_sub(t2.m,interval 1 month)
GROUP BY t1.m

统计男女用户的消费频次是否有差异

平均数
1
2
3
4
5
6
7
8
select sex,avg(ct) from (
select o.userId,sex,count(1) as ct from orderinfo as o
inner join(
select * from userinfo
where sex is not null) t
on o.userId = t.userId
group by userId,sex) t2
group by sex

统计多次消费的用户,第一次和最后一次消费间隔是多少?

1
2
3
4
select userId,datediff(max(paidTime),min(paidTime)) from orderinfo
where isPaid = '已支付'
group by userId having count(1) > 1
# datediff把秒数变成天数

统计不同年龄段,用户的消费金额是否有差异?

年龄除10然后取整数

1
2
3
4
5
6
select o.userId,age,count(o.userId) from orderinfo o
inner join(
select userId,ceil((year(now())-year(birth))/10) as age from userinfo
where birth > '1900-00-00')t
on o.userId = t.userId
group by o.userId,age

平均数

1
2
3
4
5
6
7
8
select age,avg(ct) from (
select o.userId,age,count(o.userId) as ct from orderinfo o
inner join(
select userId,ceil((year(now())-year(birth))/10) as age from userinfo
where birth > '1900-00-00')t
on o.userId = t.userId
group by o.userId,age)t2
group by age

统计消费的二八法则,消费的top20%用户,贡献了多少额度

先统计总人数

1
2
3
4
5
select count(userId)*0.2 from(
select userId,sum(price) as total from orderinfo o
where isPaid = '已支付'
group by userId
order by total desc)as t

统计贡献

1
2
3
4
5
6
select count(userId),sum(total) from(
select userId,sum(price) as total from orderinfo o
where isPaid = '已支付'
group by userId
order by total desc
limit 17000)t

SQL连接PowerBI

0%