본문 바로가기

Bioinformatics(생정보학)

TCGA somatic maf files

728x90
반응형

a1_merged_missense_maf_with_hotspot_annot20181203.r

a1v20181203.1710_tcga_somatic_missense_MAF_summary.Rdata

#===========================================

# This script is to make missense variant file for Single nucleotide variants

# Its output file will not evaluate other variants including nonsense mutation and frameshit

# contatining informations are gene, entrezID, protein position, ref aminoacid/alt aminoacid

#===========================================

library(osjpkg);install_and_attach_lib()

# Load TCGA maf files

# Set workdir

setwd('/mnt/bigHDD/resource/public/TCGA/MAF')

load.data20181130.1350()


# Combine tables from different tissue maf files

# attach number of tumors for each variations and percents

simple_maf=merge.table.with.statics_for_variants20181203.1456(x=tcga.maf.files)


# Annotate hotspot information from 2016/2017 data used in cbioportal

simple_maf2=annotate_hotspots_2016.and.2017_20181203.1704(x=simple_maf,y=hotspot)


# Export the data

setwd('/mnt/bigHDD/resource/public/TCGA/MAF')

tcga_missense_somatic_maf20181203=simple_maf2

save(tcga_missense_somatic_maf20181203,file = 'a1v20181203.1710_tcga_somatic_missense_MAF_summary.Rdata')

# Functions

if(T){

  #============================

  # Load TCGA MAF files

  #============================

  load.data20181130.1350=function(){

    wd=getwd()

    message('load MAF files')

    maf.files=list.files(pattern = 'TCGA')

    maf.files=maf.files[grep(maf.files,pattern = '.txt')]

    maf.files2=plyr::llply(1:length(maf.files),.progress = 'text',function(j){

      tmp=data.table::fread(maf.files[j],data.table = F)

      tmp2=tmp[,c('Hugo_Symbol','Entrez_Gene_Id',"Consequence", "HGVSp_Short")]

      tmp2=tmp2[grep(tmp2$Consequence,pattern = 'missense',ignore.case = T),]

      # Exclude genes having entrezID==0

      tmp2=tmp2[tmp2$Entrez_Gene_Id!=0,]

      tmp2$protein_consequence=gsub(paste0(tmp2$Hugo_Symbol,'',tmp2$HGVSp_Short),pattern = 'p.',replacement='@',fixed=T)

      return(tmp2)

    })

    names(maf.files2)=maf.files

    tcga.maf.files<<-maf.files2

    

    # Load hotspot data

    setwd('/mnt/bigHDD/resource/public/TCGA/Hotspot/')

    hotspot<<-data.table::fread('hotspot_merged2016.2017.txt',data.table=F)

    

    # Return to the workdir

    setwd(wd)

  }

  

#============================

# Merge table and annotate statics

#============================

  merge.table.with.statics_for_variants20181203.1456=function(x=tcga.maf.files){

    # Make matrix for the variants

    # Get variant types and variant types for each tumor tissue

    message('Get numbers for each variant in each tumor')

    variants=plyr::ldply(1:length(x),.progress = 'text',function(j){

      x2=x[[j]]

      tmp=paste0(x2$Entrez_Gene_Id,'@',x2$protein_consequence)

      tmp2=table(tmp)

      tmp3=data.frame('variants'=names(tmp2),'tumor_num'=paste0(names(x)[j],'(',tmp2,')'))

      return(tmp3)

    })

    variants=as.data.frame.matrix(variants)

    # Shrink the table

    # Get duplicated variants

    message('Merge the results from different tumor tissues')

    dups=unique(variants$variants[duplicated(variants$variants)])

    # Get duplicateds variants

    uniq=variants[-which(variants$variants %in% dups),]

    dupl=as.data.table(variants[which(variants$variants %in% dups),])

    dupl=dupl[,lapply(.SD,function(k){paste(collapse=',',unique(k))}),by='variants']

    dupl=as.data.frame.matrix(dupl)

    # Combine the matrix

    variants2=as.data.frame.matrix(rbind(uniq,dupl))

    # Remove TCGA_ and .txt

    variants2$tumor_num=gsub(variants2$tumor_num,pattern = 'TCGA_',replacement = "")

    variants2$tumor_num=gsub(variants2$tumor_num,pattern = '.txt',replacement = "",fixed = T)

    # Entrez_gene_Id and genesymbols

    tmp=matrix(unlist(strsplit(variants2$variants,'@')),byrow = T,ncol = 3)

    variants2=cbind(tmp,variants2)

    colnames(variants2)[1:3]=c('entrez','symbol','protein_change')

    variants2$protein_variants=paste0(variants2$symbol,'@',variants2$protein_change)

    variants2=variants2[,c(1:3,5)]

    return(variants2)

  }

  

#============================

# Merge table and annotate statics

#============================

  annotate_hotspots_2016.and.2017_20181203.1704=function(x=simple_maf,y=hotspot){

    x$merge_key=paste0(x$symbol,'@',x$protein_change)

    y$merge_key=paste0(y$Hugo_Symbol,'@',y$Reference_Amino_Acid,y$Amino_Acid_Position,y$Variant_Amino_Acid)

    wh=which(x$merge_key %in% y$merge_key)

    x$hotspot=NA

    x$hotspot[wh]='Hotspot'

    x=x[,c("entrez","symbol","protein_change","tumor_num","hotspot")]

    return(x)

  }

}



728x90
반응형

'Bioinformatics(생정보학)' 카테고리의 다른 글

Pharos: Target list DB  (0) 2021.11.19
WES alignment pipeline  (0) 2019.04.11
VEP variant location  (0) 2018.11.22
VEP for loop bash file  (0) 2018.11.14
Cancer subclone calculation  (0) 2018.10.02