## APC 
library(magrittr)
library(dplyr)
library(tidyverse)
library(data.table)
source('source_apc.R')

dir <- "path/to/GBD_Population"
files <- list.files(dir,
                    pattern = ".CSV",
                    full.names = T)
GBD_population <- map_dfr(files,fread) %>% as.data.frame()

## 对人口学数据进行整理
## 取我们需要的列名
GBD_population <- GBD_population %>% 
  select(location_name,sex_name,age_group_name,year_id,metric_name,val) 
## 采用case_when函数 对sex的标签进行重新命名
GBD_population <- GBD_population %>% mutate(sex_name = case_when(
  sex_name == "both"  ~ "Both",
  sex_name == "male"  ~ "Male",
  sex_name == "female" ~ "Female"))

names(GBD_population) <- c("location","sex","age","year","metric","val")

## 对疾病数据进行整理
case <- read.csv('糖尿病.csv')
# 按照下面条件筛选数据
DM <- case %>%
  filter(measure=="Prevalence") %>% 
  filter(cause!="Diabetes mellitus") %>%
  filter(metric=="Number") %>% 
  filter(age!="All Ages")

## 合并疾病和人口学数据
df <- left_join(DM,GBD_population,by=c("location","sex","age","metric","year"))

colnames(df)[8] <- "prevalence"
colnames(df)[11] <- "population"


df <- df %>% 
  mutate(population=round(population,digits = 0)) %>% 
  mutate(prevalence=round(prevalence,digits = 0)) %>% 
  select(location,sex,age,cause,year,prevalence,population) %>% 
  arrange(location,sex,age,cause,year) 


# 筛选不同年龄组的数据
ages <- c("15 to 19","20 to 24","25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59",        
          "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94",'95 plus')  

df <- df %>% filter(prevalence > 0 &
                      age %in% ages) %>% #删除缺失值
  mutate(age=sub('95 plus',replacement = '95 to 99', age)) %>% 
  mutate(age=factor(age,levels = ages,ordered = T)) 

## 对year重新命名
df <- df %>% mutate(year=case_when(
  year<1995 ~ "1990~1994",
  year<2000 & year>=1995 ~ "1995~1999",
  year<2005 & year>=2000 ~ "2000~2004",
  year<2010 & year>=1995 ~ "2005~2009",
  year<2015 & year>=2000 ~ "2010~2014",
  year<2020 & year>=2015 ~ "2015~2019"))

df2 <- df %>% 
  group_by(location,sex,cause,age,year) %>% 
  mutate(case=round(mean(prevalence),digits = 0)) %>% 
  mutate(population2=round(mean(population),digits = 0)) %>% 
  select(location,sex,age,cause,year,case,population2) %>% 
  unique()

## 挑选除全球、男性、不同年龄段的、不同年份的数据
## 将数据转换成行为年龄，列为年的数据
apc_case <- df2 %>% filter(location=="Global" & sex== "Male")
apc_case_preval <- reshape2::dcast(apc_case,age~year,value.var = 'case') %>% 
  arrange(age) %>% 
  select(-1)
## 将数据转换成行为年龄，列为年的数据
apc_case_pop <- reshape2::dcast(apc_case,age~year,value.var = 'population2') %>% 
  arrange(age) %>% 
  select(-1)
head(apc_case_preval)


## 将数据整理成 APC网页版需要的数据格式:疾病和人口数据交替排列
for (i in 1:(ncol(apc_case_preval)+ncol(apc_case_pop))){
  if(i == 1){
    result <-apc_case_preval[,i] %>% as.data.frame()}
  else{
    if(i%%2==0){
      result <- cbind(result,apc_case_pop[,ceiling(i/2)])}
    else{
      result <- cbind(result,apc_case_preval[,ceiling(i/2)])}
  }
}

## APC模型进一步处理
R <- prepare_rates(result,
                   StartYear=1990,StartAge=15,Interval=5,
                   fullname='',description='') 
## APC模型计算
M <- apc2fit(R)
## 画图
plot.apc1(M)
pdf(file = 'plot.pdf')
plot.apc1(M)


##  Decomposition
library(dplyr)
library(data.table)
library(purrr)
library(tidyr)
library(ggplot2)
library(ggsci)

## 读取疾病数据
case <- read.csv('EC_predict.csv')
ages <-  c("20 to 24", "25 to 29",
           "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59",
           "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89",
           "90 to 94", "95 plus")
## sub函数：将年龄的label,比如"15-19 years"转换成“15 to 19”,以匹配人口学数据和疾病数据
case <- case %>% mutate(age=sub('-',replacement = ' to ', age)) %>% 
  mutate(age=sub(' years',replacement = '', age)) %>% 
  mutate(age=sub(' year',replacement = '', age)) %>% 
  mutate(age=sub('95\\+',replacement = '95 plus', age)) %>% 
  mutate(age=sub('Age to standardized',replacement = 'Age-standardized', age)) %>% 
  mutate(age=sub('<5',replacement = 'Under 5', age)) %>% 
  filter(val>0) %>% 
  filter(age %in% ages) %>% 
  mutate(age = factor(age,levels = ages, ordered = T))
str(case)

## 读取人口学数据
dir <- "path/to/GBD_Population"
files <- list.files(dir,
                    pattern = ".CSV",
                    full.names = T)
## 这里我们只需要2年：2019和1990年，我们需要比较2019年和1990的情况
GBD_population <- map_dfr(files[c(1,30)],fread) %>% as.data.frame()

GBD_population <- GBD_population %>% mutate(sex_name = case_when(
  sex_name == "both"  ~ "Both",
  sex_name == "male"  ~ "Male",
  sex_name == "female" ~ "Female"))

population <- GBD_population %>% 
  select("location_name","sex_name","year_id","age_group_name","val") %>% 
  filter(age_group_name %in% ages) %>% 
  mutate(age_group_name = factor(age_group_name, levels = ages,ordered = T))

str(population)

## 这里，我们以全球男性发病数据为例，进行decomposition的讲解
## 获取1990年不同年龄段的占比 a1990
Global_population_1990 <- population %>% 
  filter(location_name == 'Global' & 
           year_id == 1990 &
           sex_name == 'Male') %>% 
  arrange(age_group_name) ### 按年龄从小到大排序

Global_1990 <- sum(Global_population_1990$val)
Global_population_1990$percent <- Global_population_1990$val/Global_1990 ### 获取1990年不同年龄段的占比

## 获取2019年不同年龄段的占比
Global_population_2019 <- population %>% 
  filter(location_name == 'Global' & 
           year_id == 2019 &
           sex_name == 'Male') %>% 
  arrange(age_group_name) ### 按年龄从小到大排序

Global_2019 <- sum(Global_population_2019$val)
Global_population_2019$percent <- Global_population_2019$val/Global_2019 ### 获取2019年不同年龄段的占比

### a代表年龄占比，p代表总人群数
a_1990 <- Global_population_1990$percent
a_2019 <- Global_population_2019$percent
p_1990 <- Global_1990
p_2019 <- Global_2019

## 获取率的数据
case_1990 <- case %>% filter(year == 1990 &
                               sex == 'Male' &
                               location == 'Global' &
                               metric == 'Rate' &
                               measure == 'Incidence') %>% 
                            arrange(age)

case_2019 <- case %>% filter(year == 2019 &
                               sex == 'Male' &
                               location == 'Global' &
                               metric == 'Rate' &
                               measure == 'Incidence') %>% 
                               arrange(age)

r_1990 <- as.numeric(case_1990$val)/10^5 ## 单位转换
r_2019 <- as.numeric(case_2019$val)/10^5 ## 单位转换


## 根据公式计算
a_effect <- round((sum(a_2019*p_1990*r_1990) + sum(a_2019*p_2019*r_2019))/3 + 
                    (sum(a_2019*p_1990*r_2019) + sum(a_2019*p_2019*r_1990))/6 -
                    (sum(a_1990*p_1990*r_1990) + sum(a_1990*p_2019*r_2019))/3 -
                    (sum(a_1990*p_1990*r_2019) + sum(a_1990*p_2019*r_1990))/6,3)

p_effect <- round((sum(a_1990*p_2019*r_1990) + sum(a_2019*p_2019*r_2019))/3 + 
                    (sum(a_1990*p_2019*r_2019) + sum(a_2019*p_2019*r_1990))/6 -
                    (sum(a_1990*p_1990*r_1990) + sum(a_2019*p_1990*r_2019))/3 -
                    (sum(a_1990*p_1990*r_2019) + sum(a_2019*p_1990*r_1990))/6,3)

r_effect <- round((sum(a_1990*p_1990*r_2019) + sum(a_2019*p_2019*r_2019))/3 + 
                    (sum(a_1990*p_2019*r_2019) + sum(a_2019*p_1990*r_2019))/6 -
                    (sum(a_1990*p_1990*r_1990) + sum(a_2019*p_2019*r_1990))/3 -
                    (sum(a_1990*p_2019*r_1990) + sum(a_2019*p_1990*r_1990))/6,3)

overll_differ <- round(a_effect + p_effect + r_effect,3)


## 循环语句
## decomposition calculation
decomposition_name <- c('location','overll_difference','a_effect','p_effect','r_effect','a_percent',
                        'p_percent','r_percent')
decomposition_data <- as.data.frame(matrix(nrow=0,ncol=length(decomposition_name))) 
names(decomposition_data) <- decomposition_name

loc_name <- c("Global","High SDI","High-middle SDI","Middle SDI","Low-middle SDI","Low SDI")

a= loc_name[1]
for (a in loc_name) {
  Global_population_1990 <- population %>% 
    filter(location_name == a & 
             year_id == 1990 &
             sex_name == 'Male') %>% 
    arrange(age_group_name) ### 按年龄从小到大排序
  
  Global_1990 <- sum(Global_population_1990$val) ## 计算1990人口总数
  Global_population_1990$percent <- Global_population_1990$val/Global_1990 ### 获取1990年不同年龄段的占比
  
  ## 获取2019年不同年龄段的占比
  Global_population_2019 <- population %>% 
    filter(location_name == a & 
             year_id == 2019 &
             sex_name == 'Male') %>% 
    arrange(age_group_name) ### 按年龄从小到大排序
  
  Global_2019 <- sum(Global_population_2019$val) ## 计算2019人口总数
  Global_population_2019$percent <- Global_population_2019$val/Global_2019 ### 获取2019年不同年龄段的占比
  
  ### a代表年龄占比，p代表总人群数
  a_1990 <- Global_population_1990$percent
  a_2019 <- Global_population_2019$percent
  p_1990 <- Global_1990
  p_2019 <- Global_2019
  
  ## 获取率的数据
  case_1990 <- case %>% filter(year == 1990 &
                                 sex == 'Male' &
                                 location == a &
                                 metric == 'Rate' &
                                 measure == 'Incidence') %>% 
    arrange(age)
  
  case_2019 <- case %>% filter(year == 2019 &
                                 sex == 'Male' &
                                 location == a &
                                 metric == 'Rate' &
                                 measure == 'Incidence') %>% 
    arrange(age)
  
  r_1990 <- as.numeric(case_1990$val)/10^5 ## 单位转换
  r_2019 <- as.numeric(case_2019$val)/10^5 ## 单位转换
  
  ##### 根据公式计算
  a_effect <- round((sum(a_2019*p_1990*r_1990) + sum(a_2019*p_2019*r_2019))/3 + 
                      (sum(a_2019*p_1990*r_2019) + sum(a_2019*p_2019*r_1990))/6 -
                      (sum(a_1990*p_1990*r_1990) + sum(a_1990*p_2019*r_2019))/3 -
                      (sum(a_1990*p_1990*r_2019) + sum(a_1990*p_2019*r_1990))/6,3)
  
  p_effect <- round((sum(a_1990*p_2019*r_1990) + sum(a_2019*p_2019*r_2019))/3 + 
                      (sum(a_1990*p_2019*r_2019) + sum(a_2019*p_2019*r_1990))/6 -
                      (sum(a_1990*p_1990*r_1990) + sum(a_2019*p_1990*r_2019))/3 -
                      (sum(a_1990*p_1990*r_2019) + sum(a_2019*p_1990*r_1990))/6,3)
  
  r_effect <- round((sum(a_1990*p_1990*r_2019) + sum(a_2019*p_2019*r_2019))/3 + 
                      (sum(a_1990*p_2019*r_2019) + sum(a_2019*p_1990*r_2019))/6 -
                      (sum(a_1990*p_1990*r_1990) + sum(a_2019*p_2019*r_1990))/3 -
                      (sum(a_1990*p_2019*r_1990) + sum(a_2019*p_1990*r_1990))/6,3)
  
  overll_differ <- round(a_effect + p_effect + r_effect,2)
  
  a_percent <- round(a_effect/overll_differ*100,2)
  p_percent <- round(p_effect/overll_differ*100,2)
  r_percent <- round(r_effect/overll_differ*100,2)
  
  temp <- c(a,overll_differ,a_effect,p_effect,r_effect,a_percent,
            p_percent,r_percent) %>% t() %>% as.data.frame()
  names(temp) <- decomposition_name
  decomposition_data <- rbind(decomposition_data,temp)
}


data <- read.csv("EC_predict.csv")
num_1990 <- data %>% filter(age == 'All ages' &
                         sex == 'Male' &
                         metric == 'Number' &
                           year == 1990 &
                           measure == 'Incidence') %>% 
                          select(location, val) %>% 
                          rename(val_1990 = val)

num_2019 <- data %>% filter(age == 'All ages' &
                              sex == 'Male' &
                              metric == 'Number' &
                              year == 2019 &
                              measure == 'Incidence') %>% 
                            select(location, val) %>% 
                           rename(val_2019 = val)


decomposition_data <- left_join(decomposition_data,num_1990, by = 'location') %>% 
                       left_join(num_2019, by = 'location')

decomposition_data$diff <- decomposition_data$val_2019 - decomposition_data$val_1990
decomposition_data[,2:11] <- decomposition_data[,2:11] %>% apply(c(1,2),as.numeric)
round(decomposition_data$diff) == round(decomposition_data$overll_difference)


## 作图
names(decomposition_data)[2:5] <- c('Overll difference','Aging','Population','Epidemiological change')

decomposition_plot <- decomposition_data[c(1:5)] %>% 
  pivot_longer(3:5,
               names_to = "varname",
               values_to = "value") %>% 
  mutate(value=as.numeric(value)) %>% 
  mutate(location=factor(location,levels = loc_name,ordered = T))

p <- ggplot(decomposition_plot, aes(x= location,y=value, fill= varname)) +
  geom_bar(stat="identity",position = "stack") +
  coord_flip() + scale_fill_nejm() + 
  theme_bw()


decomposition_data$`Overll difference` <- as.numeric(decomposition_data$`Overll difference`)
plot <- p+ geom_point(data=decomposition_data, mapping=aes(x=location,y= `Overll difference`),fill='black',color='black',size=3)


## frontier analysis
library(dplyr)
library(data.table)
library(purrr)
library(tidyr)
library(ggplot2)
library(ggrepel)

case <- read.csv('frontier.csv',header = T)
SDI <- read.csv('SDI1990-2019modified.csv',header = T)

## 合并SDI值
frontier_data <- case %>% 
  filter(age=='Age-standardized' & 
           metric== 'Rate' &
           sex =='Both' &
           measure=='DALYs (Disability-Adjusted Life Years)') %>% 
  select(2,7,8) %>% rename(ASR = val)


frontier_SDI <- left_join(frontier_data,SDI,by=c('location','year'))

boostrap_DEA  <-  as.data.frame(matrix(nrow=0,ncol=6)) 
names(boostrap_DEA) <- c(names(frontier_SDI),'frontier','super')
boostrap_num <- 1 ### boostrap次数
for(interation_number in 1:boostrap_num){
  boot_sample <- frontier_SDI[sample(1:nrow(frontier_SDI),nrow(frontier_SDI),replace = TRUE),] ## 有放回抽样，抽样nrow(frontier_SDI)次
  boot_sample <- boot_sample %>% arrange(SDI,desc(ASR))  ## 我们对SDI从小到大排列、ASR从大到小排列进行排列
  boot_sample$super <- NA
  boot_sample$super[1] <- 0 ## super变量是用来判断是否是super-efficient
  for (i in 2:nrow(frontier_SDI)) {
    data <- boot_sample[-i,] ## 对每个点进行排除，判断是否是super-efficient
    data$frontier <- NA ## 产生frontier变量
    min <- data$ASR[1] ## ASR已从大到小排列，因此第一点是默认的frontier点
    for (j in 1:(i-1)) { ## 为了节约计算空间，我们只需要对剔除点以前的数据生成frontier值来判断这个点是否是super-efficient
      min <- ifelse(data$ASR[j]<min,data$ASR[j],min) 
      data$frontier[j] <- min} ## 判断该点是否比frontier值小，如果是，成为新的frontier
    boot_sample$super[i] <- ifelse(boot_sample[i,1]==boot_sample[i+1,1] & boot_sample[i,2]==boot_sample[i+1,2],0,  
          ifelse(boot_sample$ASR[i]<data$frontier[i-1],0,1)) ## 判断排除的点是否是super-efficient,两种情况，一种是排除的点和后面的点是同一个值，那就默认不是super-efficient；如果不和后一个点一样，就判断ASR和前一个点的frontier的大小，如果小于frontier,就认为是super-efficient
  }
  ## 排除super-efficient后计算每个点的frontier值
  boot_sample_exclude <- boot_sample[boot_sample$super==0,]
  min <- boot_sample_exclude$ASR[1]
  for (z in 1:nrow(boot_sample_exclude)){
    min <- ifelse(boot_sample_exclude$ASR[z]<min,boot_sample_exclude$ASR[z],min)
    boot_sample_exclude$frontier[z] <- min
  }
  boostrap_DEA <- rbind(boostrap_DEA,boot_sample_exclude)
  boostrap_DEA <-boostrap_DEA %>% 
                group_by(location,year,ASR,SDI) %>%
                summarize(frontier=mean(frontier))
  print(interation_number)
}

load(file='boostrap_DEA.Rdata')
boostrap_DEA <- boostrap_DEA %>% 
  mutate(eff_diff = ASR - frontier)

boostrap_DEA_1990 <- boostrap_DEA %>% 
  filter(year == 1990)

boostrap_DEA_2019 <- boostrap_DEA %>% 
  filter(year == 2019) 

boostrap_DEA_2019$trend <- ifelse(boostrap_DEA_2019$ASR > boostrap_DEA_1990$ASR, "Increase",
                                  "Decrease")

plotA <- ggplot(boostrap_DEA, aes(SDI,ASR)) + geom_point(aes(color = year),size=1.8)+ 
  scale_x_continuous(breaks = c(0,0.2,0.4,0.6,0.8,1.0),limits = c(0,1)) +
  scale_y_reverse() + ## y轴旋转
  scale_color_gradient(low='orange',high='darkgreen') + ## 连续性变量配色
  stat_smooth(data=boostrap_DEA, aes(SDI,frontier),colour='black',formula=y ~ poly(x, 1),
              stat = "smooth",method='loess',se=F,span=0.2,fullrange=T) + ## 绘制frontier线
  theme_bw()


black <- boostrap_DEA_2019[order(boostrap_DEA_2019$eff_diff,decreasing = T),][1:15,]   ## 获取距离差最大的15个点
blue <- subset(boostrap_DEA_2019,SDI<0.5)[order(subset(boostrap_DEA_2019,SDI<0.5)$eff_diff),][1:5,]   ## 低SDI国家中距离差最小的5个国家
red <- subset(boostrap_DEA_2019,SDI>0.85)[order(subset(boostrap_DEA_2019,SDI>0.85)$eff_diff,decreasing = T),][1:5,] ## 高SDI国家中距离差最大的5个国家

plotB <- ggplot(boostrap_DEA_2019, aes(SDI,ASR)) + geom_point(aes(color = trend),size=2.5)+
  scale_x_continuous(breaks = c(0,0.2,0.4,0.6,0.8,1.0),limits = c(0,1)) + ## x轴标度设定
  scale_y_reverse() + ## y轴调转
  stat_smooth(data=boostrap_DEA, aes(SDI,frontier),colour='black',formula=y ~ poly(x, 1),
              stat = "smooth",method='loess',se=F,span=0.2,fullrange=T)  + ## 画frontier拟合线
  geom_text_repel(data=black,colour='black',aes(SDI,ASR, label = location),size=2.5,fontface= 'bold',max.overlaps = 160) + ##添加距离差最大的15个地区名称，并标记在对应的点上
  geom_text_repel(data=red,colour='darkred',aes(SDI,ASR, label = location),size=2.5,fontface= 'bold',max.overlaps = 160) + ##添加高SDI国家中距离差最大的5个国家，并标记在对应的点上，用红色字体现实
  geom_text_repel(data=blue,colour='darkblue',aes(SDI,ASR, label = location),size=2.5,fontface= 'bold',max.overlaps = 160)  + ##添加低SDI国家中距离差最小的5个国家，并标记在对应的点上，用蓝色字体现实
  theme_bw()