#' Make VEP.vcf files into R's data.frame format
#'
#' This function is to load and processing VEP.vcf file into R data.frame\cr
#'
#' @param vep.vcf.dir (default=NULL) : Directory path where VEP.vcf files are stored.\cr
#' @param Rdata (default='VCF.VEP.merged.Rdata') : Name of Rdata. If you provide name of Rdata file, Rdata with processed VEP will be generated.\cr
#' Otherwise, Rdata won't be generated.\cr
#' @param out.path (default=NULL) : Output path. If you don't provide out.path, the rdata will be stored under vep.dir
#' @param return2R (default=F) : Whether processed data should be returned in R environment.
#' @keywords VCF, VEP, read.depth
#' @export
#' @examples
#' # Suppose u have VCF and VEP files exactly same number
#' vep.vcf.dir='C:/Users/osj118/Downloads'
#' out.path='C:/Users/osj118/Downloads'
#' return2R=F
#' Rdata='VCF.VEP.merged.Rdata'
#' vcf.results=read_vep.vcf(vcf.dir=vcf.dir,vep.dir=vep.dir,out.path=NULL,Rdata='VCF.VEP.merged.Rdata')
#-----------------------------------------
# Attaching readDepth to VEP file
#-----------------------------------------
read_vep.vcf=function(vep.vcf.dir=NULL,out.path=NULL,Rdata='VCF.VEP.merged.Rdata',return2R=F,
file_pattern=NULL){
# Load the VCF files
# Get VCF files
if(is.null(file_pattern)){file_pattern='.vcf'}
vcf.files=list.files(path = vep.vcf.dir,pattern = file_pattern,full.names = T)
# Loading the VCF files and make compatible with VEP coordination #x=1
vcf.files2=plyr::llply(1:length(vcf.files),.progress = 'text',function(x){
cat('\r',vcf.files[x])
# Get number of lines containing '#' to skip the line
tmp=readLines(vcf.files[x],n = 1000)
skip.n=max(grep(tmp,pattern = '#'))
# Read the file and skip the line containing #
tmp2=data.table::fread(vcf.files[x],skip = skip.n-1,data.table = F)
colnames(tmp2)=gsub(colnames(tmp2),pattern = '#',replacement = "")
# Read info.col data
info.col=tmp[skip.n-1]
info.col2=unlist(strsplit(info.col,' Format: ',fixed = T))[2]
info.col3=unlist(strsplit(info.col2,'|',fixed = T))
info.col3=gsub(info.col3,pattern='\">',fixed = T,replacement = '')
# Process info column #k=1023 #k=43481
#grep(tmp2$INFO,pattern = 'Unknown',ignore.case = T)
vcf.col=c('CHROM','POS','ID','REF','ALT','QUAL','FILTER','FORMAT')
remove.col=c('BAM_EDIT','HGVSc','HGVSp','Existing_variation',
"GIVEN_REF","USED_REF",'Feature_type')
tmp3=plyr::ldply(1:nrow(tmp2),.progress='text',function(k){
#
k1=tmp2$INFO[k]
k1=gsub(k1,pattern = '|',fixed = T,replacement = '@|$')
k2=unlist(strsplit(k1,'|',fixed = T))
k3=k2[grep(k2,pattern='CSQ=',fixed=T):length(k2)]
k4=gsub(k3,pattern = '$@',replacement = '.',fixed = T)
k5=matrix(k4[2:length(k4)],ncol=32,byrow=T)
colnames(k5)=info.col3[-1]
# GET VCF col
z1=tmp2[k,c(which(colnames(tmp2) %in% vcf.col),ncol(tmp2))]
z2=data.table::rbindlist(replicate(nrow(k5),z1,simplify = F))
colnames(z2)[ncol(z2)]='INFO'
z2$sample=colnames(z1)[ncol(z1)]
# Combine tables
z3=as.data.frame(cbind(z2,k5),stringsAsFactors=F)
return(z3)
})
# Get canonical genes
tmp4=tmp3[which(tmp3$CANONICAL!='.'),]
tmp4=tmp4[grep(tmp4$Gene,pattern = 'ENSG'),]
for(i in 1:ncol(tmp4)){
tmp4[,i]=multi_sub(tmp4[,i],pattern = c('$','@'),
replacement = c('',''),partial = T)
}
tmp4$PolyPhen=unlist(plyr::llply(tmp4$PolyPhen,function(j){
j1=unlist(strsplit(j,',',fixed = T))[1]
return(j1)
}))
tmp4=tmp4[,setdiff(colnames(tmp4),remove.col)]
return(tmp4)
})
vcf.files.name=gsub(vcf.files,pattern = vep.vcf.dir,replacement = '')
vcf.files.name=gsub(vcf.files.name,pattern = '/',replacement = '',fixed = T)
names(vcf.files2)=vcf.files.name
#============================
# Return Rdata
#============================
if(!is.null(Rdata)){
message(paste0('Saving Rdata : ',Rdata))
if(is.null(out.path)){out.path=vep.vcf.dir}
assign(Rdata,vcf.files2)
save(list=ls(pattern = Rdata),file = paste0(out.path,'/',Rdata))
}
if(return2R){
return(vcf.files2)
}
}
카테고리 없음
read_vep.vcf.r
728x90
반응형
728x90
반응형