APC网页版数据处理
格式要求:
1.period的间隔长度需要和age的长度保持一致:比如年龄为5年一组,period也应该是5年为一组
2.数据格式年龄为行,period为列,且疾病数据和人口学数据交替排列
读取相关R包
library(magrittr)
library(dplyr)
library(tidyverse)
library(data.table)读取人口学数据
dir <- "path/to/GBD_Population"
files <- list.files(dir,
pattern = ".CSV",
full.names = T)
GBD_population <- map_dfr(files,fread) %>% as.data.frame()## 对人口学数据进行整理
## 取我们需要的列名
GBD_population <- GBD_population %>%
select(location_name,sex_name,age_group_name,year_id,metric_name,val)
## 采用case_when函数 对sex的标签进行重新命名
GBD_population <- GBD_population %>% mutate(sex_name = case_when(
sex_name == "both" ~ "Both",
sex_name == "male" ~ "Male",
sex_name == "female" ~ "Female"))
names(GBD_population) <- c("location","sex","age","year","metric","val")整理疾病数据
## 对疾病数据进行整理
case <- read.csv('糖尿病.csv')
# 按照下面条件筛选数据
DM <- case %>%
filter(measure=="Prevalence") %>%
filter(cause!="Diabetes mellitus") %>%
filter(metric=="Number") %>%
filter(age!="All Ages")匹配筛选人口数据
## 合并疾病和人口学数据
df <- left_join(DM,GBD_population,by=c("location","sex","age","metric","year"))
colnames(df)[8] <- "prevalence"
colnames(df)[11] <- "population"
df <- df %>%
mutate(population=round(population,digits = 0)) %>%
mutate(prevalence=round(prevalence,digits = 0)) %>%
select(location,sex,age,cause,year,prevalence,population) %>%
arrange(location,sex,age,cause,year) # 筛选不同年龄组的数据
ages <- c("15 to 19","20 to 24","25 to 29","30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59",
"60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 to 89", "90 to 94",'95 plus')
df <- df %>% filter(prevalence > 0) %>% #删除缺失值
mutate(age=sub('95 plus',replacement = '95 to 99', age)) %>%
mutate(age=factor(age,levels = ages,ordered = T))
## 对year重新命名
df <- df %>% mutate(year=case_when(
year<1995 ~ "1990~1994",
year<2000 & year>=1995 ~ "1995~1999",
year<2005 & year>=2000 ~ "2000~2004",
year<2010 & year>=1995 ~ "2005~2009",
year<2015 & year>=2000 ~ "2010~2014",
year<2020 & year>=2015 ~ "2015~2019"))
df2 <- df %>%
group_by(location,sex,cause,age,year) %>%
mutate(case=round(mean(prevalence),digits = 0)) %>%
mutate(population2=round(mean(population),digits = 0)) %>%
select(location,sex,age,cause,year,case,population2) %>%
unique()## 挑选除全球、男性、不同年龄段的、不同年份的数据
## 将数据转换成行为年龄,列为年的数据
apc_case <- df2 %>% filter(location=="Global" & sex== "Male")
apc_case_preval <- reshape2::dcast(apc_case,age~year,value.var = 'case') %>%
arrange(age) %>%
select(-1)
## 将数据转换成行为年龄,列为年的数据
apc_case_pop <- reshape2::dcast(apc_case,age~year,value.var = 'population2') %>%
arrange(age) %>%
select(-1)## 将数据整理成 APC网页版需要的数据格式:疾病和人口数据交替排列
for (i in 1:(ncol(apc_case_preval)+ncol(apc_case_pop))){
if(i == 1){
result <-apc_case_preval[,i] %>% as.data.frame()}
else{
if(i%%2==0){
result <- cbind(result,apc_case_pop[,ceiling(i/2)])}
else{
result <- cbind(result,apc_case_preval[,ceiling(i/2)])}
}
}
write.csv(result,'apc_web_data.csv',row.names = F,col.names = F)APC网页版演示
请看视频 APC网页版
数据展示
APC_output <- readRDS("path/to/GBD")数据结果意义
检验结果
常用图形绘制
## long age curve
long_age <- APC_output[["LongAge"]] %>% as.data.frame()
ggplot(data=long_age,aes(x=Age,y=Rate)) +
geom_line(color='Red') +
geom_ribbon(aes(Age,ymin=CILo,ymax=CIHi,fill='Red'),alpha=0.1) +
theme_bw()## CohortRR
CohortRR <- APC_output[["CohortRR"]] %>% as.data.frame()
ggplot(data=CohortRR,aes(x=Cohort,y=`Rate Ratio`)) +
geom_line(color='Red') +
geom_ribbon(aes(Cohort,ymin=CILo,ymax=CIHi,fill='Red'),alpha=0.1) +
theme_bw() +
geom_hline(yintercept = 1,linetype = 2, color= 'black') +
geom_vline(xintercept = CohortRR[which(CohortRR$`Rate Ratio`==1),1],
linetype = 2, color= 'black')## periodRR
PeriodRR <- APC_output[["PeriodRR"]] %>% as.data.frame()
ggplot(data=PeriodRR,aes(x=Period,y=`Rate Ratio`)) +
geom_line(color='Red') +
geom_ribbon(aes(Period,ymin=CILo,ymax=CIHi,fill='Red'),alpha=0.1) +
theme_bw() +
geom_hline(yintercept = 1,linetype = 2, color= 'black') +
geom_vline(xintercept = PeriodRR[which(PeriodRR$`Rate Ratio`==1),1],
linetype = 2, color= 'black')### Local drift and net drift
Local <- APC_output[["LocalDrifts"]] %>% as.data.frame()
Net <- APC_output[["NetDrift"]] %>% as.data.frame()
ggplot(data=Local,aes(x=Age,y=`Percent per Year`)) +
geom_line(color='Red') +
geom_ribbon(aes(Age,ymin=CILo,ymax=CIHi,fill='Red'),alpha=0.1) +
theme_bw() +
geom_hline(yintercept = 0,linetype = 2, color= 'black') +
geom_hline(data= Net,aes(yintercept = `Net Drift (%/year)`),color='Blue',
linetype = 1) +
geom_hline(data= Net,aes(yintercept = `CI Lo`),color='Blue',
linetype = 2) +
geom_hline(data= Net,aes(yintercept = `CI Hi`),color='Blue',
linetype = 2)