Skip to main content
 首页 » 编程设计

R语言利用tidyr包处理数据实现交叉表

2022年07月19日175rubylouvre

本文使用 dplyr 和 tidyr包中函数实现较差表。

spread()函数

spread()函数来自tidyr包,可以把键值对变量展开为多列。语法如下:

spread(data, key value)

data: 数据框名称
key: 它的值将变成列变量
value: 它的值用于填充新产生变量

下面通过示例来的学习。

展开值为两列

#create data frame 
df <- data.frame(player=rep(c('A', 'B'), each=4), 
                 year=rep(c(1, 1, 2, 2), times=2), 
                 stat=rep(c('points', 'assists'), times=4), 
                 amount=c(14, 6, 18, 7, 22, 9, 38, 4)) 
 
#view data frame 
df 
 
#   player year    stat amount 
# 1      A    1  points     14 
# 2      A    1 assists      6 
# 3      A    2  points     18 
# 4      A    2 assists      7 
# 5      B    1  points     22 
# 6      B    1 assists      9 
# 7      B    2  points     38 
# 8      B    2 assists      4 
 
library(tidyr) 
 
#spread stat column across multiple columns 
spread(df, key=stat, value=amount) 
 
#   player year assists points 
# 1      A    1       6     14 
# 2      A    2       7     18 
# 3      B    1       9     22 
# 4      B    2       4     38 

超过两列示例

#create data frame 
df2 <- data.frame(player=rep(c('A'), times=8), 
                 year=rep(c(1, 2), each=4), 
                 stat=rep(c('points', 'assists', 'steals', 'blocks'), times=2), 
                 amount=c(14, 6, 2, 1, 29, 9, 3, 4)) 
 
#view data frame 
df2 
 
#   player year    stat amount 
# 1      A    1  points     14 
# 2      A    1 assists      6 
# 3      A    1  steals      2 
# 4      A    1  blocks      1 
# 5      A    2  points     29 
# 6      A    2 assists      9 
# 7      A    2  steals      3 
# 8      A    2  blocks      4 
 
library(tidyr) 
 
#spread stat column across multiple columns 
spread(df2, key=stat, value=amount) 
 
#   player year assists blocks points steals 
# 1      A    1       6      1     14      2 
# 2      A    2       9      4     29      3 
 

spreand() 分组场景

语法如下:

 
# df %>% 
#   group_by(var1, var2) %>% 
#   tally() %>% 
#   spread(var1, n) 
 

下面通过示例来学习。

创建基础交叉表

首先定义示例数据框:

#create data frame 
df <- data.frame(team=c('T1', 'T1', 'T1', 'T1', 'T2', 'T2', 'T2', 'T2'), 
                 position=c('G', 'G', 'F', 'C', 'G', 'F', 'F', 'C'), 
                 points=c(7, 7, 8, 11, 13, 15, 19, 13)) 
 
#view data frame 
df 
 
#   team position points 
# 1   T1        G      7 
# 2   T1        G      7 
# 3   T1        F      8 
# 4   T1        C     11 
# 5   T2        G     13 
# 6   T2        F     15 
# 7   T2        F     19 
# 8   T2        C     13 

下面代码使用team 和 position 变量创建交叉表:

library(dplyr) 
library(tidyr) 
 
#produce crosstab  
df %>% 
  group_by(team, position) %>% 
  tally() %>% 
  spread(position, n) 
 
# A tibble: 2 x 4 
#   team      C     F     G 
#   <chr> <int> <int> <int> 
# 1 T1        1     1     2 
# 2 T2        1     2     1 
 

返回结果表示T1团队在C,F,G位置上的人数分别为1,1,2;T2团队在C,F,G位置上的人数分别为1,2,1。
如果我们展开团队变量呢,请看代码:

 
df %>% 
  group_by(team, position) %>% 
  tally() %>% 
  spread(team, n) 
 
# A tibble: 3 x 3 
# # Groups:   position [3] 
#   position    T1    T2 
#   <chr>    <int> <int> 
# 1 C            1     1 
# 2 F            1     2 
# 3 G            2     1 

结果非常直接,关键是*spread(team, n)*实现方式非常简单且统一。

上面使用了tally函数。该函数是对summarise()函数的包装,同时依据第一次计算或重新计算而调用n() 、sum() 函数。相当于 count()函数,但之前要调用group_by() ,之后要调用ungroup().

tally()函数

我们看看tally()函数示例。

# tally() is short-hand for summarise() 
mtcars %>% tally() 
#>    n 
#> 1 32 
mtcars %>% group_by(cyl) %>% tally() 
#> # A tibble: 3 x 2 
#>     cyl     n 
#>   <dbl> <int> 
#> 1     4    11 
#> 2     6     7 
#> 3     8    14 
# count() is a short-hand for group_by() + tally() 
mtcars %>% count(cyl) 
#> # A tibble: 3 x 2 
#>     cyl     n 
#>   <dbl> <int> 
#> 1     4    11 
#> 2     6     7 
#> 3     8    14 
# Note that if the data is already grouped, count() adds 
# an additional group that is removed afterwards 
mtcars %>% group_by(gear) %>% count(carb) 
#> # A tibble: 11 x 3 
#> # Groups:   gear [3] 
#>     gear  carb     n 
#>    <dbl> <dbl> <int> 
#>  1     3     1     3 
#>  2     3     2     4 
#>  3     3     3     3 
#>  4     3     4     5 
#>  5     4     1     4 
#>  6     4     2     4 
#>  7     4     4     4 
#>  8     5     2     2 
#>  9     5     4     1 
#> 10     5     6     1 
#> 11     5     8     1 
 
# add_tally() is short-hand for mutate() 
mtcars %>% add_tally() 
#> # A tibble: 32 x 12 
#>      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb     n 
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> 
#>  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4    32 
#>  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4    32 
#>  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1    32 
#>  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1    32 
#>  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2    32 
#>  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1    32 
#>  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4    32 
#>  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2    32 
#>  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2    32 
#> 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4    32 
#> # … with 22 more rows 
# add_count() is a short-hand for group_by() + add_tally() 
mtcars %>% add_count(cyl) 
#> # A tibble: 32 x 12 
#>      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb     n 
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> 
#>  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4     7 
#>  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4     7 
#>  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1    11 
#>  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1     7 
#>  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2    14 
#>  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1     7 
#>  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4    14 
#>  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2    11 
#>  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2    11 
#> 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4     7 
#> # … with 22 more rows 
 
# count() and tally() are designed so that you can call 
# them repeatedly, each time rolling up a level of detail 

本文参考链接:https://blog.csdn.net/neweastsun/article/details/121287785
阅读延展