a1_merged_missense_maf_with_hotspot_annot20181203.r
a1v20181203.1710_tcga_somatic_missense_MAF_summary.Rdata
#===========================================
# This script is to make missense variant file for Single nucleotide variants
# Its output file will not evaluate other variants including nonsense mutation and frameshit
# contatining informations are gene, entrezID, protein position, ref aminoacid/alt aminoacid
#===========================================
library(osjpkg);install_and_attach_lib()
# Load TCGA maf files
# Set workdir
setwd('/mnt/bigHDD/resource/public/TCGA/MAF')
load.data20181130.1350()
# Combine tables from different tissue maf files
# attach number of tumors for each variations and percents
simple_maf=merge.table.with.statics_for_variants20181203.1456(x=tcga.maf.files)
# Annotate hotspot information from 2016/2017 data used in cbioportal
simple_maf2=annotate_hotspots_2016.and.2017_20181203.1704(x=simple_maf,y=hotspot)
# Export the data
setwd('/mnt/bigHDD/resource/public/TCGA/MAF')
tcga_missense_somatic_maf20181203=simple_maf2
save(tcga_missense_somatic_maf20181203,file = 'a1v20181203.1710_tcga_somatic_missense_MAF_summary.Rdata')
# Functions
if(T){
#============================
# Load TCGA MAF files
#============================
load.data20181130.1350=function(){
wd=getwd()
message('load MAF files')
maf.files=list.files(pattern = 'TCGA')
maf.files=maf.files[grep(maf.files,pattern = '.txt')]
maf.files2=plyr::llply(1:length(maf.files),.progress = 'text',function(j){
tmp=data.table::fread(maf.files[j],data.table = F)
tmp2=tmp[,c('Hugo_Symbol','Entrez_Gene_Id',"Consequence", "HGVSp_Short")]
tmp2=tmp2[grep(tmp2$Consequence,pattern = 'missense',ignore.case = T),]
# Exclude genes having entrezID==0
tmp2=tmp2[tmp2$Entrez_Gene_Id!=0,]
tmp2$protein_consequence=gsub(paste0(tmp2$Hugo_Symbol,'',tmp2$HGVSp_Short),pattern = 'p.',replacement='@',fixed=T)
return(tmp2)
})
names(maf.files2)=maf.files
tcga.maf.files<<-maf.files2
# Load hotspot data
setwd('/mnt/bigHDD/resource/public/TCGA/Hotspot/')
hotspot<<-data.table::fread('hotspot_merged2016.2017.txt',data.table=F)
# Return to the workdir
setwd(wd)
}
#============================
# Merge table and annotate statics
#============================
merge.table.with.statics_for_variants20181203.1456=function(x=tcga.maf.files){
# Make matrix for the variants
# Get variant types and variant types for each tumor tissue
message('Get numbers for each variant in each tumor')
variants=plyr::ldply(1:length(x),.progress = 'text',function(j){
x2=x[[j]]
tmp=paste0(x2$Entrez_Gene_Id,'@',x2$protein_consequence)
tmp2=table(tmp)
tmp3=data.frame('variants'=names(tmp2),'tumor_num'=paste0(names(x)[j],'(',tmp2,')'))
return(tmp3)
})
variants=as.data.frame.matrix(variants)
# Shrink the table
# Get duplicated variants
message('Merge the results from different tumor tissues')
dups=unique(variants$variants[duplicated(variants$variants)])
# Get duplicateds variants
uniq=variants[-which(variants$variants %in% dups),]
dupl=as.data.table(variants[which(variants$variants %in% dups),])
dupl=dupl[,lapply(.SD,function(k){paste(collapse=',',unique(k))}),by='variants']
dupl=as.data.frame.matrix(dupl)
# Combine the matrix
variants2=as.data.frame.matrix(rbind(uniq,dupl))
# Remove TCGA_ and .txt
variants2$tumor_num=gsub(variants2$tumor_num,pattern = 'TCGA_',replacement = "")
variants2$tumor_num=gsub(variants2$tumor_num,pattern = '.txt',replacement = "",fixed = T)
# Entrez_gene_Id and genesymbols
tmp=matrix(unlist(strsplit(variants2$variants,'@')),byrow = T,ncol = 3)
variants2=cbind(tmp,variants2)
colnames(variants2)[1:3]=c('entrez','symbol','protein_change')
variants2$protein_variants=paste0(variants2$symbol,'@',variants2$protein_change)
variants2=variants2[,c(1:3,5)]
return(variants2)
}
#============================
# Merge table and annotate statics
#============================
annotate_hotspots_2016.and.2017_20181203.1704=function(x=simple_maf,y=hotspot){
x$merge_key=paste0(x$symbol,'@',x$protein_change)
y$merge_key=paste0(y$Hugo_Symbol,'@',y$Reference_Amino_Acid,y$Amino_Acid_Position,y$Variant_Amino_Acid)
wh=which(x$merge_key %in% y$merge_key)
x$hotspot=NA
x$hotspot[wh]='Hotspot'
x=x[,c("entrez","symbol","protein_change","tumor_num","hotspot")]
return(x)
}
}
'Bioinformatics(생정보학)' 카테고리의 다른 글
Pharos: Target list DB (0) | 2021.11.19 |
---|---|
WES alignment pipeline (0) | 2019.04.11 |
VEP variant location (0) | 2018.11.22 |
VEP for loop bash file (0) | 2018.11.14 |
Cancer subclone calculation (0) | 2018.10.02 |