APC模型

读取相关R包

library(magrittr)
library(dplyr)
library(tidyverse)
library(data.table)
source('source_apc.R')

读取人口学数据

dir <- "path/to/GBD_Population"
files <- list.files(dir,
                    pattern = ".CSV",
                    full.names = T)
GBD_population <- map_dfr(files,fread) %>% as.data.frame()
## 对人口学数据进行整理
## 取我们需要的列名
GBD_population <- GBD_population %>% 
  select(location_name,sex_name,age_group_name,year_id,metric_name,val) 
## 采用case_when函数 对sex的标签进行重新命名
GBD_population <- GBD_population %>% mutate(sex_name = case_when(
                                          sex_name == "both"  ~ "Both",
                                          sex_name == "male"  ~ "Male",
                                          sex_name == "female" ~ "Female"))

names(GBD_population) <- c("location","sex","age","year","metric","val")

整理疾病数据

## 对疾病数据进行整理
case <- read.csv('糖尿病.csv')
# 按照下面条件筛选数据
DM <- case %>%
  filter(measure=="Prevalence") %>% 
  filter(cause!="Diabetes mellitus") %>%
  filter(metric=="Number") %>% 
  filter(age!="All Ages")

匹配筛选人口数据

## 合并疾病和人口学数据
df <- left_join(DM,GBD_population,by=c("location","sex","age","metric","year"))

colnames(df)[8] <- "prevalence"
colnames(df)[11] <- "population"


df <- df %>% 
  mutate(population=round(population,digits = 0)) %>% 
  mutate(prevalence=round(prevalence,digits = 0)) %>% 
  select(location,sex,age,cause,year,prevalence,population) %>% 
  arrange(location,sex,age,cause,year) 
# 筛选不同年龄组的数据
ages <- c("15 to 19","20 to 24","25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59",        
          "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94",'95 plus')  

df <- df %>% filter(prevalence > 0 &
                    age %in% ages) %>% #删除缺失值
            mutate(age=sub('95 plus',replacement = '95 to 99', age)) %>% 
            mutate(age=factor(age,levels = ages,ordered = T)) 
            
## 对year重新命名
df <- df %>% mutate(year=case_when(
                                  year<1995 ~ "1990~1994",
                                  year<2000 & year>=1995 ~ "1995~1999",
                                  year<2005 & year>=2000 ~ "2000~2004",
                                  year<2010 & year>=1995 ~ "2005~2009",
                                  year<2015 & year>=2000 ~ "2010~2014",
                                  year<2020 & year>=2015 ~ "2015~2019"))

df2 <- df %>% 
  group_by(location,sex,cause,age,year) %>% 
  mutate(case=round(mean(prevalence),digits = 0)) %>% 
  mutate(population2=round(mean(population),digits = 0)) %>% 
  select(location,sex,age,cause,year,case,population2) %>% 
  unique()
## 挑选除全球、男性、不同年龄段的、不同年份的数据
## 将数据转换成行为年龄,列为年的数据
apc_case <- df2 %>% filter(location=="Global" & sex== "Male")
apc_case_preval <- reshape2::dcast(apc_case,age~year,value.var = 'case') %>% 
                     arrange(age) %>% 
                      select(-1)
## 将数据转换成行为年龄,列为年的数据
apc_case_pop <- reshape2::dcast(apc_case,age~year,value.var = 'population2') %>% 
                     arrange(age) %>% 
                        select(-1)
head(apc_case_preval)
课程示例来源   1990~1994 1995~1999 2000~2004 2005~2009 2010~2014 2015~2019
课程示例来源 1    446019    511722    617877    742345    834629    918286
课程示例来源 2   1333600   1415518   1621238   2155945   2696779   2807725
课程示例来源 3   2524464   2863499   3085007   3632505   4723582   5665627
课程示例来源 4   3566148   4384714   4999932   5591028   6546156   8364953
课程示例来源 5   5095827   5781665   7123508   8411145   9453526  11195830
课程示例来源 6   6395007   7649305   8821923  11244421  13335295  15163028
head(apc_case_pop)
课程示例来源   1990~1994 1995~1999 2000~2004 2005~2009 2010~2014 2015~2019
课程示例来源 1 263489077 275964699 305287790 320884895 312624286 314526879
课程示例来源 2 252444621 254270673 266409630 295459239 311510043 304807073
课程示例来源 3 234080057 249773178 251850669 263859000 291488843 307325657
课程示例来源 4 203974876 233248699 248120178 252430916 264438465 291326984
课程示例来源 5 186252870 202377891 231761665 247151662 252004962 263678889
课程示例来源 6 158058040 182004469 197196451 226075241 241663285 246950961
## 将数据整理成 APC网页版需要的数据格式:疾病和人口数据交替排列
for (i in 1:(ncol(apc_case_preval)+ncol(apc_case_pop))){
  if(i == 1){
    result <-apc_case_preval[,i] %>% as.data.frame()}
  else{
    if(i%%2==0){
      result <- cbind(result,apc_case_pop[,ceiling(i/2)])}
    else{
      result <- cbind(result,apc_case_preval[,ceiling(i/2)])}
  }
}

head(result)
课程示例来源         . apc_case_pop[, ceiling(i/2)] apc_case_preval[, ceiling(i/2)]
课程示例来源 1  446019                    263489077                          511722
课程示例来源 2 1333600                    252444621                         1415518
课程示例来源 3 2524464                    234080057                         2863499
课程示例来源 4 3566148                    203974876                         4384714
课程示例来源 5 5095827                    186252870                         5781665
课程示例来源 6 6395007                    158058040                         7649305
课程示例来源   apc_case_pop[, ceiling(i/2)] apc_case_preval[, ceiling(i/2)]
课程示例来源 1                    275964699                          617877
课程示例来源 2                    254270673                         1621238
课程示例来源 3                    249773178                         3085007
课程示例来源 4                    233248699                         4999932
课程示例来源 5                    202377891                         7123508
课程示例来源 6                    182004469                         8821923
课程示例来源   apc_case_pop[, ceiling(i/2)] apc_case_preval[, ceiling(i/2)]
课程示例来源 1                    305287790                          742345
课程示例来源 2                    266409630                         2155945
课程示例来源 3                    251850669                         3632505
课程示例来源 4                    248120178                         5591028
课程示例来源 5                    231761665                         8411145
课程示例来源 6                    197196451                        11244421
课程示例来源   apc_case_pop[, ceiling(i/2)] apc_case_preval[, ceiling(i/2)]
课程示例来源 1                    320884895                          834629
课程示例来源 2                    295459239                         2696779
课程示例来源 3                    263859000                         4723582
课程示例来源 4                    252430916                         6546156
课程示例来源 5                    247151662                         9453526
课程示例来源 6                    226075241                        13335295
课程示例来源   apc_case_pop[, ceiling(i/2)] apc_case_preval[, ceiling(i/2)]
课程示例来源 1                    312624286                          918286
课程示例来源 2                    311510043                         2807725
课程示例来源 3                    291488843                         5665627
课程示例来源 4                    264438465                         8364953
课程示例来源 5                    252004962                        11195830
课程示例来源 6                    241663285                        15163028
课程示例来源   apc_case_pop[, ceiling(i/2)]
课程示例来源 1                    314526879
课程示例来源 2                    304807073
课程示例来源 3                    307325657
课程示例来源 4                    291326984
课程示例来源 5                    263678889
课程示例来源 6                    246950961
## APC模型进一步处理
R <- prepare_rates(result,
                      StartYear=1990,StartAge=15,Interval=5,
                      fullname='',description='') 
## APC模型计算
M <- apc2fit(R)
## 画图
plot.apc1(M)

decomposition analysis

将疾病发病数/死亡数/DALY转分解成3个因素:人口数变化、年龄结构变化、疾病本身流行病学改变

R包读取

library(dplyr)
library(data.table)
library(purrr)
library(tidyr)
library(ggplot2)
library(ggsci)

读取疾病数据

case <- read.csv('EC_predict.csv')
ages <-  c("20 to 24", "25 to 29",
          "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59",
          "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89",
          "90 to 94", "95 plus")
## sub函数:将年龄的label,比如"15-19 years"转换成“15 to 19”,以匹配人口学数据和疾病数据
case <- case %>% mutate(age=sub('-',replacement = ' to ', age)) %>% 
  mutate(age=sub(' years',replacement = '', age)) %>% 
  mutate(age=sub(' year',replacement = '', age)) %>% 
  mutate(age=sub('95\\+',replacement = '95 plus', age)) %>% 
  mutate(age=sub('Age to standardized',replacement = 'Age-standardized', age)) %>% 
  mutate(age=sub('<5',replacement = 'Under 5', age)) %>% 
  filter(val>0) %>% 
  filter(age %in% ages) %>% 
  mutate(age = factor(age,levels = ages, ordered = T))
str(case)
课程示例来源 'data.frame': 34560 obs. of  10 variables:
课程示例来源  $ measure : chr  "Deaths" "Deaths" "Deaths" "Deaths" ...
课程示例来源  $ location: chr  "Global" "Global" "Global" "Global" ...
课程示例来源  $ sex     : chr  "Male" "Female" "Both" "Male" ...
课程示例来源  $ age     : Ord.factor w/ 16 levels "20 to 24"<"25 to 29"<..: 1 1 1 1 1 1 2 2 2 2 ...
课程示例来源  $ cause   : chr  "Esophageal cancer" "Esophageal cancer" "Esophageal cancer" "Esophageal cancer" ...
课程示例来源  $ metric  : chr  "Number" "Number" "Number" "Rate" ...
课程示例来源  $ year    : int  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
课程示例来源  $ val     : num  204.0673 211.5666 415.6338 0.0822 0.0866 ...
课程示例来源  $ upper   : num  233.1789 251.6184 473.4528 0.0939 0.103 ...
课程示例来源  $ lower   : num  147.8799 148.4676 308.4978 0.0596 0.0608 ...

读取人口学数据

dir <- "path/to/GBD_Population"
files <- list.files(dir,
                    pattern = ".CSV",
                    full.names = T)
## 这里我们只需要2年:2019和1990年,我们需要比较2019年和1990的情况
GBD_population <- map_dfr(files[c(1,30)],fread) %>% as.data.frame()

GBD_population <- GBD_population %>% mutate(sex_name = case_when(
  sex_name == "both"  ~ "Both",
  sex_name == "male"  ~ "Male",
  sex_name == "female" ~ "Female"))

population <- GBD_population %>% 
                select("location_name","sex_name","year_id","age_group_name","val") %>% 
                filter(age_group_name %in% ages) %>% 
                mutate(age_group_name = factor(age_group_name, levels = ages,ordered = T))

str(population)
课程示例来源 'data.frame': 64800 obs. of  5 variables:
课程示例来源  $ location_name : chr  "Global" "Global" "Global" "Global" ...
课程示例来源  $ sex_name      : chr  "Male" "Male" "Male" "Male" ...
课程示例来源  $ year_id       : int  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
课程示例来源  $ age_group_name: Ord.factor w/ 16 levels "20 to 24"<"25 to 29"<..: 1 2 3 4 5 6 7 8 9 10 ...
课程示例来源  $ val           : num  2.48e+08 2.23e+08 1.95e+08 1.79e+08 1.46e+08 ...

我们以单个国家给大家进行讲解

分解分析数据准备

## 这里,我们以全球男性发病数据为例,进行decomposition的讲解
## 获取1990年不同年龄段的占比
  Global_population_1990 <- population %>% 
    filter(location_name == 'Global' & 
             year_id == 1990 &
             sex_name == 'Male') %>% 
    arrange(age_group_name) ### 按年龄从小到大排序

  Global_1990 <- sum(Global_population_1990$val)
  Global_population_1990$percent <- Global_population_1990$val/Global_1990 ### 获取1990年不同年龄段的占比
  
## 获取2019年不同年龄段的占比
  Global_population_2019 <- population %>% 
    filter(location_name == 'Global' & 
             year_id == 2019 &
             sex_name == 'Male') %>% 
    arrange(age_group_name) ### 按年龄从小到大排序

  Global_2019 <- sum(Global_population_2019$val)
  Global_population_2019$percent <- Global_population_2019$val/Global_2019 ### 获取2019年不同年龄段的占比
  
### a代表年龄占比,p代表总人群数
a_1990 <- Global_population_1990$percent
a_2019 <- Global_population_2019$percent
p_1990 <- Global_1990
p_2019 <- Global_2019

## 获取率的数据
  case_1990 <- case %>% filter(year == 1990 &
                                sex == 'Male' &
                                 location == 'Global' &
                                 metric == 'Rate' &
                                 measure == 'Incidence') %>% 
                        arrange(age)
  
  case_2019 <- case %>% filter(year == 2019 &
                                sex == 'Male' &
                                 location == 'Global' &
                                 metric == 'Rate' &
                                 measure == 'Incidence') %>% 
                        arrange(age)
  
  r_1990 <- as.numeric(case_1990$val)/10^5 ## 单位转换
  r_2019 <- as.numeric(case_2019$val)/10^5 ## 单位转换

decomposition结果计算

计算原理与公式 疾病数的计算原理

分解分析的原理:

  ## 根据公式计算
  a_effect <- round((sum(a_2019*p_1990*r_1990) + sum(a_2019*p_2019*r_2019))/3 + 
                      (sum(a_2019*p_1990*r_2019) + sum(a_2019*p_2019*r_1990))/6 -
                      (sum(a_1990*p_1990*r_1990) + sum(a_1990*p_2019*r_2019))/3 -
                      (sum(a_1990*p_1990*r_2019) + sum(a_1990*p_2019*r_1990))/6,3)
  
  p_effect <- round((sum(a_1990*p_2019*r_1990) + sum(a_2019*p_2019*r_2019))/3 + 
                      (sum(a_1990*p_2019*r_2019) + sum(a_2019*p_2019*r_1990))/6 -
                      (sum(a_1990*p_1990*r_1990) + sum(a_2019*p_1990*r_2019))/3 -
                      (sum(a_1990*p_1990*r_2019) + sum(a_2019*p_1990*r_1990))/6,3)
  
  r_effect <- round((sum(a_1990*p_1990*r_2019) + sum(a_2019*p_2019*r_2019))/3 + 
                      (sum(a_1990*p_2019*r_2019) + sum(a_2019*p_1990*r_2019))/6 -
                      (sum(a_1990*p_1990*r_1990) + sum(a_2019*p_2019*r_1990))/3 -
                      (sum(a_1990*p_2019*r_1990) + sum(a_2019*p_1990*r_1990))/6,3)
  
  overll_differ <- round(a_effect + p_effect + r_effect,3)

循环计算6个地区的结果

## decomposition calculation
decomposition_name <- c('location','overll_difference','a_effect','p_effect','r_effect','a_percent',
                        'p_percent','r_percent')
decomposition_data <- as.data.frame(matrix(nrow=0,ncol=length(decomposition_name))) 
names(decomposition_data) <- decomposition_name

loc_name <- c("Global","High SDI","High-middle SDI","Middle SDI","Low-middle SDI","Low SDI")
         

for (a in loc_name) {
  Global_population_1990 <- population %>% 
    filter(location_name == a & 
             year_id == 1990 &
             sex_name == 'Male') %>% 
    arrange(age_group_name) ### 按年龄从小到大排序

  Global_1990 <- sum(Global_population_1990$val) ## 计算1990人口总数
  Global_population_1990$percent <- Global_population_1990$val/Global_1990 ### 获取1990年不同年龄段的占比
  
## 获取2019年不同年龄段的占比
  Global_population_2019 <- population %>% 
    filter(location_name == a & 
             year_id == 2019 &
             sex_name == 'Male') %>% 
    arrange(age_group_name) ### 按年龄从小到大排序

  Global_2019 <- sum(Global_population_2019$val) ## 计算2019人口总数
  Global_population_2019$percent <- Global_population_2019$val/Global_2019 ### 获取2019年不同年龄段的占比
  
### a代表年龄占比,p代表总人群数
a_1990 <- Global_population_1990$percent
a_2019 <- Global_population_2019$percent
p_1990 <- Global_1990
p_2019 <- Global_2019

## 获取率的数据
  case_1990 <- case %>% filter(year == 1990 &
                                sex == 'Male' &
                                 location == a &
                                 metric == 'Rate' &
                                 measure == 'Incidence') %>% 
                        arrange(age)
  
  case_2019 <- case %>% filter(year == 2019 &
                                sex == 'Male' &
                                 location == a &
                                 metric == 'Rate' &
                                 measure == 'Incidence') %>% 
                        arrange(age)
  
  r_1990 <- as.numeric(case_1990$val)/10^5 ## 单位转换
  r_2019 <- as.numeric(case_2019$val)/10^5 ## 单位转换
  
  ##### 根据公式计算
  a_effect <- round((sum(a_2019*p_1990*r_1990) + sum(a_2019*p_2019*r_2019))/3 + 
                      (sum(a_2019*p_1990*r_2019) + sum(a_2019*p_2019*r_1990))/6 -
                      (sum(a_1990*p_1990*r_1990) + sum(a_1990*p_2019*r_2019))/3 -
                      (sum(a_1990*p_1990*r_2019) + sum(a_1990*p_2019*r_1990))/6,3)
  
  p_effect <- round((sum(a_1990*p_2019*r_1990) + sum(a_2019*p_2019*r_2019))/3 + 
                      (sum(a_1990*p_2019*r_2019) + sum(a_2019*p_2019*r_1990))/6 -
                      (sum(a_1990*p_1990*r_1990) + sum(a_2019*p_1990*r_2019))/3 -
                      (sum(a_1990*p_1990*r_2019) + sum(a_2019*p_1990*r_1990))/6,3)
  
  r_effect <- round((sum(a_1990*p_1990*r_2019) + sum(a_2019*p_2019*r_2019))/3 + 
                      (sum(a_1990*p_2019*r_2019) + sum(a_2019*p_1990*r_2019))/6 -
                      (sum(a_1990*p_1990*r_1990) + sum(a_2019*p_2019*r_1990))/3 -
                      (sum(a_1990*p_2019*r_1990) + sum(a_2019*p_1990*r_1990))/6,3)
  
  overll_differ <- round(a_effect + p_effect + r_effect,2)
  a_percent <- round(a_effect/overll_differ*100,2)
  p_percent <- round(p_effect/overll_differ*100,2)
  r_percent <- round(r_effect/overll_differ*100,2)
  temp <- c(a,overll_differ,a_effect,p_effect,r_effect,a_percent,
                              p_percent,r_percent) %>% t() %>% as.data.frame()
  names(temp) <- decomposition_name
  decomposition_data <- rbind(decomposition_data,temp)
}

判断结果计算是否正确

data <- read.csv("EC_predict.csv")
num_1990 <- data %>% filter(age == 'All ages' &
                         sex == 'Male' &
                         metric == 'Number' &
                           year == 1990 &
                           measure == 'Incidence') %>% 
                          select(location, val) %>% 
                          rename(val_1990 = val)

num_2019 <- data %>% filter(age == 'All ages' &
                              sex == 'Male' &
                              metric == 'Number' &
                              year == 2019 &
                              measure == 'Incidence') %>% 
                            select(location, val) %>% 
                           rename(val_2019 = val)


decomposition_data <- left_join(decomposition_data,num_1990, by = 'location') %>% 
                       left_join(num_2019, by = 'location')

decomposition_data$diff <- decomposition_data$val_2019 - decomposition_data$val_1990
decomposition_data[,2:11] <- decomposition_data[,2:11] %>% apply(c(1,2),as.numeric)
round(decomposition_data$diff) == round(decomposition_data$overll_difference)
课程示例来源 [1] TRUE TRUE TRUE TRUE TRUE TRUE

decomposition作图

names(decomposition_data)[2:5] <- c('Overll difference','Aging','Population','Epidemiological change')

decomposition_plot <- decomposition_data[c(1:5)] %>% 
                     pivot_longer(Aging:`Epidemiological change`,
                                  names_to = "varname",
                                  values_to = "value") %>% 
                     mutate(value=as.numeric(value)) %>% 
                     mutate(location=factor(location,levels = loc_name,ordered = T))

p <- ggplot(decomposition_plot, aes(x= location,y=value, fill= varname)) +
  geom_bar(stat="identity",position = "stack") +
  coord_flip() + scale_fill_nejm() + 
  theme_bw()


decomposition_data$`Overll difference` <- as.numeric(decomposition_data$`Overll difference`)
plot <- p+ geom_point(data=decomposition_data, mapping=aes(x=location,y= `Overll difference`),fill='black',color='black',size=3)

Frontier analysis

载入R包

library(dplyr)
library(data.table)
library(purrr)
library(tidyr)
library(ggplot2)
library(ggrepel)

数据准备

case <- read.csv('frontier.csv',header = T)
SDI <- read.csv('SDI1990-2019modified.csv',header = T)
## 合并SDI值
frontier_data <- case %>% 
               filter(age=='Age-standardized' & 
               metric== 'Rate' &
               sex=='Both' &
               measure=='DALYs (Disability-Adjusted Life Years)') %>% 
               select(2,7,8) %>% rename(ASR = val)


frontier_SDI <- left_join(frontier_data,SDI,by=c('location','year'))

frontier思路 FDH方法是指按SDI、ASR后取每个位置的最小值,然后选取其中的点用水平线和垂直线连接,让所有的点都在这条线内部,但这个方法算出来的点是没有误差的,所以我们需要引入误差,采用重抽样方法来产生误差,同时我们需要找出每个点是不是super-efficient点,并进行排除。

具体思路是每个循环内:

1.有放回抽样形成一个新的数据

2.新数据内分别删除一个点画frontier线,按这个frontier线判断这个点是否落在这个线以内,如果是,不需要排除,如果判断这个点的值比frontier值低,认定是super-efficient点,需要进行排除;依次循环确认每个点是否是super-efficient点。

3.在新的数据里排除super-efficient点后,我们再对剩下的点计算每个SDI值的frontier。

以上方法为一个boostrap,1000次就是重复上述操作1000次

这里开始我们的假设:随着SDI的增大,疾病的ASR应该是逐渐降低的,因为SDI越高,国家的医疗水平更高,DALY率也越低

boostrap_DEA  <-  as.data.frame(matrix(nrow=0,ncol=6)) 
names(boostrap_DEA) <- c(names(frontier_SDI),'frontier','super')
boostrap_num <- 1 ### boostrap次数
for(interation_number in 1:boostrap_num){
  boot_sample <- frontier_SDI[sample(1:nrow(frontier_SDI),nrow(frontier_SDI),replace = TRUE),] ## 有放回抽样,抽样nrow(frontier_SDI)次
  boot_sample <- boot_sample %>% arrange(SDI,desc(ASR)) ## 我们对SDI从小到大排列、ASR从大到小排列进行排列
  boot_sample$super <- NA
  boot_sample$super[1] <- 0 ## super变量是用来判断是否是super-efficient
  for (i in 2:nrow(frontier_SDI)) {
    data <- boot_sample[-i,] ## 对每个点进行排除,判断是否是super-efficient
    data$frontier <- NA ## 产生frontier变量
    min <- data$ASR[1] ## ASR已从大到小排列,因此第一点是默认的frontier点
    for (j in 1:(i-1)) { ## 为了节约计算空间,我们只需要对剔除点以前的数据生成frontier值来判断这个点是否是super-efficient
      min <- ifelse(data$ASR[j]<min,data$ASR[j],min) 
      data$frontier[j] <- min} ## 判断该点是否比frontier值小,如果是,成为新的frontier
    boot_sample$super[i] <- ifelse(boot_sample[i,1]==boot_sample[i+1,1] & boot_sample[i,2]==boot_sample[i+1,2],0,  ifelse(boot_sample$ASR[i]<data$frontier[i-1],0,1)) ## 判断排除的点是否是super-efficient,两种情况,一种是排除的点和后面的点是同一个值,那就默认不是super-efficient;如果不和后一个点一样,就判断ASR和前一个点的frontier的大小,如果小于frontier,就认为是super-efficient
  }
  ## 排除super-efficient后计算每个点的frontier值
  boot_sample_exclude <- boot_sample[boot_sample$super==0,]
  min <- boot_sample_exclude$ASR[1]
  for (z in 1:nrow(boot_sample_exclude)){
    min <- ifelse(boot_sample_exclude$ASR[z]<min,boot_sample_exclude$ASR[z],min)
    boot_sample_exclude$frontier[z] <- min
  }
  boostrap_DEA <- rbind(boostrap_DEA,boot_sample_exclude)
  boostrap_DEA <-boostrap_DEA %>% group_by(location,year,ASR,SDI)%>%summarize(frontier=mean(frontier))
  print(interation_number)
}
课程示例来源 [1] 1
#save(boostrap_DEA,file = 'boostrap_DEA.Rdata')
load(file='boostrap_DEA.Rdata')
boostrap_DEA <- boostrap_DEA %>% 
                mutate(eff_diff = ASR - frontier)

boostrap_DEA_1990 <- boostrap_DEA %>% 
                    filter(year == 1990)

boostrap_DEA_2019 <- boostrap_DEA %>% 
                    filter(year == 2019) 

boostrap_DEA_2019$trend <- ifelse(boostrap_DEA_2019$ASR > boostrap_DEA_1990$ASR, "Increase",
                                          "Decrease")

绘制示例论文中的图片

制作图A

plotA <- ggplot(boostrap_DEA, aes(SDI,ASR)) + geom_point(aes(color = year),size=1.8)+ 
  scale_x_continuous(breaks = c(0,0.2,0.4,0.6,0.8,1.0),limits = c(0,1)) +
  scale_y_reverse() + ## y轴旋转
  scale_color_gradient(low='orange',high='darkgreen') + ## 连续性变量配色
  stat_smooth(data=boostrap_DEA, aes(SDI,frontier),colour='black',formula=y ~ poly(x, 1),
              stat = "smooth",method='loess',se=F,span=0.2,fullrange=T) + ## 绘制frontier线
  theme_bw()

制作图B

black <- boostrap_DEA_2019[order(boostrap_DEA_2019$eff_diff,decreasing = T),][1:15,]   ## 获取距离差最大的15个点
blue <- subset(boostrap_DEA_2019,SDI<0.5)[order(subset(boostrap_DEA_2019,SDI<0.5)$eff_diff),][1:5,]   ## 低SDI国家中距离差最小的5个国家
red <- subset(boostrap_DEA_2019,SDI>0.85)[order(subset(boostrap_DEA_2019,SDI>0.85)$eff_diff,decreasing = T),][1:5,] ## 高SDI国家中距离差最大的5个国家

plotB <- ggplot(boostrap_DEA_2019, aes(SDI,ASR)) + geom_point(aes(color = trend),size=2.5)+
  scale_x_continuous(breaks = c(0,0.2,0.4,0.6,0.8,1.0),limits = c(0,1)) + ## x轴标度设定
  scale_y_reverse() + ## y轴调转
  stat_smooth(data=boostrap_DEA, aes(SDI,frontier),colour='black',formula=y ~ poly(x, 1),
              stat = "smooth",method='loess',se=F,span=0.2,fullrange=T)  + ## 画frontier拟合线
  geom_text_repel(data=black,colour='black',aes(SDI,ASR, label = location),size=2.5,fontface= 'bold',max.overlaps = 160) + ##添加距离差最大的15个地区名称,并标记在对应的点上
  geom_text_repel(data=red,colour='darkred',aes(SDI,ASR, label = location),size=2.5,fontface= 'bold',max.overlaps = 160) + ##添加高SDI国家中距离差最大的5个国家,并标记在对应的点上,用红色字体现实
  geom_text_repel(data=blue,colour='darkblue',aes(SDI,ASR, label = location),size=2.5,fontface= 'bold',max.overlaps = 160)  + ##添加低SDI国家中距离差最小的5个国家,并标记在对应的点上,用蓝色字体现实
  theme_bw()

参考文献

  1. A Web Tool for Age-Period-Cohort Analysis of Cancer Incidence and Mortality Rates. Cancer Epidemiology
  2. Analysis of the Global Burden of Disease study highlights the global, regional, and national trends of chronic kidney disease epidemiology from 1990 to 2016
  3. Healthcare Access and Quality Index based on mortality from causes amenable to personal health care in 195 countries and territories, 1990-2015: a novel analysis from the Global Burden of Disease Study 2015