
Bioinformatics

pMHC1 binding benchmark running script



'''
This script computes binding scores for the given pMHC class I combinations
with the following tools:
NetMHCpan 4.1
MixMHCpred 2.0.2
DeepOmicsNEO
MHCflurry 2.1.0
mhcnuggets
'''
# Parameter and library setup
params={
    'in':'/data01/dataset/neoantigen/public/iedb/20240308/minor-allele-testset.pickle'
    # NOTE: 'out' points at the same file as 'in', so the final table overwrites the input pickle
    ,'out':'/data01/dataset/neoantigen/public/iedb/20240308/minor-allele-testset.pickle'
    # Software paths
    ,'mixmhcpred':'/data03/project/sjoh/00_tools/MixMHCpred/MixMHCpred'
    ,'netmhcpan':'/data03/project/sjoh/00_tools/netmhcpan/v4.1pvac/netmhcpan_4_1_executable/netMHCpan'
    ,'mhcflurry-docker-id':'openvax/mhcflurry:latest'
    ,'mhcnuggets-docker-id':'osj-mhcnuggets'
    ,'mhcnuggets-path':'/data03/project/sjoh/00_tools/mhcnuggets/mhcnuggets'
    # Parameters
    ,'random-fold':10
}
params['workdir']='/'.join(params['out'].split('/')[:-1])


import subprocess as sbp
import pandas as pd
import gc



def loading_pMHC_table(path):
    '''Load a pMHC table from a pickle/tsv/csv file; columns must contain [peptide, mhc].'''
    if 'pickle' in path:
        f1=pd.read_pickle(path)
    elif '.tsv' in path:
        f1=pd.read_csv(path,sep='\t')
    elif '.csv' in path:
        f1=pd.read_csv(path,sep=',')
    else:
        raise ValueError('Check the pMHC table: the file must be pickle/tsv/csv and its columns should contain [peptide, mhc]')
    return f1

indata={}
indata['in']=loading_pMHC_table(path=params['in'])
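# Illustrative sketch (commented out; example values are hypothetical): the
# minimal table the rest of the pipeline expects, with the column names used below.
# example = pd.DataFrame({
#     'peptide': ['SIINFEKL', 'GILGFVFTL'],
#     'mhc': ['HLA-A*02:01', 'HLA-A*02:01'],
#     'label': ['Positive', 'Positive']
# })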


# adding random peptides
#x=indata['in'];pep_len=[8,12];fold=5
def random_peptide_generator(x,pep_len=[8,12],fold=10):
    '''
    x : indata['in']
    pep_len : [min, max] random peptide length range (inclusive)
    fold : how many random peptides to generate per observed pMHC row
    '''
    import random
    amino_acids = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
        'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'
        ]
    counts=x.shape[0]*fold
    pep_len_range=[i for i in range(pep_len[0],pep_len[1]+1)]
    random_peptides=[]
    for i in range(counts):
        peplen=random.choice(pep_len_range)  # pick a random length in the range
        peptide=''.join(random.choices(amino_acids,k=peplen))
        random_peptides.append(peptide)
    neg_df=pd.DataFrame({'peptide':random_peptides,'mhc':x['mhc'].tolist()*fold,'label':['Random']*counts})
    x1=pd.concat([x,neg_df],axis=0).reset_index(drop=1)
    return x1
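# Illustrative example (hypothetical numbers): with fold=10 and 1,000 input rows,
# 10,000 decoy peptides of length 8-12 are appended with label 'Random', each
# paired with an MHC taken from the observed 'mhc' column tiled 10 times.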


pdata={}
pdata['in']=random_peptide_generator(x=indata['in'],pep_len=[8,12],fold=params['random-fold'])

# Remove duplicated and contradictory pMHC pairs
def remove_duplicate_and_contradictive_pMHC(x=pdata['in']):
    # Join all labels per (peptide, mhc) pair; any pair occurring more than once
    # (a duplicate, or a real peptide colliding with a random decoy) gets a comma
    # in its joined label and is dropped entirely.
    x1=x.groupby(['peptide','mhc']).agg({'label':lambda s: ','.join(s)}).reset_index(drop=0)
    x1=x1.loc[x1['label'].apply(lambda s: ',' not in s),:].reset_index(drop=1)
    return x1

pdata['in']=remove_duplicate_and_contradictive_pMHC(x=pdata['in'])

# mixMHCpred command line
#/path/to/MixMHCpred -i input_file -o output_file -a A0101,A2501,B0801,B1801
# -i Ideally a text file
def running_mixMHCpred(x=indata['in'],path=params['workdir'],mixmhcpred=params['mixmhcpred']):
    import subprocess as sbp
    import pandas as pd
    # Exporting the peptide file
    peptides='\n'.join(x['peptide'].unique())
    with open(path+'/mixMHCpred_peptide_input.txt','w') as f:
        f.write(peptides)
    # Get MHCs: 'HLA-A*01:01' -> 'A0101', the -a format MixMHCpred expects
    mhcs=x['mhc'].unique()
    mhcs=[i.split('-')[1].replace('*','').replace(':','') for i in mhcs]
    mhcs=','.join(mhcs)
    # Make command line and run it.
    cmd=f'{mixmhcpred} -i {path}/mixMHCpred_peptide_input.txt -o {path}/mixMHCpred-result.txt -a {mhcs}'
    sbp.call(cmd,shell=True)
    # Loading the result
    mm=pd.read_csv(f'{path}/mixMHCpred-result.txt',sep='\t',comment='#')
    # Attach the prediction result to the original data
    mm=mm.drop(['Score_bestAllele','BestAllele','%Rank_bestAllele'],axis=1)
    x1=x.copy()
    # 'HLA-A*01:01' -> 'A0101', matching the allele key in MixMHCpred column names
    x1['mixMHCkey']=x1['mhc'].apply(lambda x: x.split('-')[1].replace('*','').replace(':',''))
    # Wide per-allele columns (Score_A0101, %Rank_A0101, ...) -> one row per (peptide, allele)
    mm1=mm.melt('Peptide')
    mm1[['category','mixMHCkey']]=mm1['variable'].str.split('_').tolist()
    mm2=mm1.pivot(index=['Peptide','mixMHCkey'],columns='category').reset_index().loc[:,['Peptide','mixMHCkey','value']]
    mm2.columns=['peptide','mixMHCkey','%Rank','Score']
    x2=pd.merge(x1,mm2,on=['peptide','mixMHCkey']).drop('mixMHCkey',axis=1)
    gc.collect()
    return x2
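# Minimal sketch of the reshape above (made-up numbers): MixMHCpred reports one
# Score_/%Rank_ column pair per allele, which melt + pivot turns into tidy rows.
# wide = pd.DataFrame({'Peptide':['SIINFEKL'],'Score_A0101':[0.12],'%Rank_A0101':[4.8]})
# long = wide.melt('Peptide')   # -> columns: Peptide | variable | value
# long[['category','mixMHCkey']] = long['variable'].str.split('_').tolist()
# pivoting on 'category' then yields one row per (Peptide, allele) with
# separate '%Rank' and 'Score' columns, ready to merge back onto the input.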


indata['mixMHCpred']=running_mixMHCpred(x=pdata['in'],path=params['workdir'],mixmhcpred=params['mixmhcpred'])

# NetMHCpan command line
#/path/to/netMHCpan -p test.pep -BA -a HLA-A01:01,HLA-A02:01 > output_file
def reading_netmhcpan_result(path):
    with open(path,'r') as f:
        f1=f.readlines()
    # Drop comment lines and near-empty lines
    content=[i for i in f1 if '#' not in i and len(i)>3]
    # Get the header line, then strip the header/separator/annotation lines
    header=[i for i in content if 'Pos' in i and 'MHC' in i][0]
    content=[i for i in content if header not in i and '------------' not in i and 'Distance' not in i and 'Protein' not in i]
    header=header.split()[:16]
    # Parse the whitespace-separated rows into a table
    content=pd.DataFrame([i.split()[:16] for i in content],columns=header).drop(['Identity','Pos'],axis=1)
    # Cast the score columns to numeric where present
    for c in ['Score_EL','%Rank_EL','Score_BA','%Rank_BA','Aff(nM)']:
        try:
            content[c]=content[c].astype(float)
        except (KeyError,ValueError):
            continue
    return content
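# Assumed layout (hedged; based on typical NetMHCpan 4.1 text output): '#'-prefixed
# comment lines, dashed separator lines, a whitespace-separated header containing
# 'Pos' and 'MHC', and data rows with at least 16 columns; any trailing BindLevel
# tag is dropped by the [:16] slice above.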


def running_NetMHCpan(x=indata['in'],path=params['workdir'],netmhcpan=params['netmhcpan']):
    import subprocess as sbp
    import pandas as pd
    # Exporting the peptide file
    peptides='\n'.join(x['peptide'].unique())
    with open(path+'/NetMHCpan_peptide_input.txt','w') as f:
        f.write(peptides)
    # Get MHCs: 'HLA-A*01:01' -> 'HLA-A01:01', the -a format netMHCpan expects
    mhcs=x['mhc'].unique()
    mhcs=[i.replace('*','') for i in mhcs]
    mhcs=','.join(mhcs)
    # Make command line and run it.
    print('Running NetMHCpan')
    cmd=f'{netmhcpan} -p {path}/NetMHCpan_peptide_input.txt -BA -a {mhcs} > {path}/NetMHCpan-result.txt'
    sbp.call(cmd,shell=True)
    # Loading the result
    nmp=reading_netmhcpan_result(path=f'{path}/NetMHCpan-result.txt')
    # Attach the prediction result to original data
    keep_col=['MHC','Peptide','Score_EL','%Rank_EL','Score_BA','%Rank_BA','Aff(nM)']
    nmp1=nmp.loc[:,keep_col].rename(columns={'MHC':'mhc','Peptide':'peptide'})
    x1=pd.merge(x,nmp1,on=['mhc','peptide'])
    gc.collect()
    return x1

indata['netmhcpan']=running_NetMHCpan(x=pdata['in'],path=params['workdir'],netmhcpan=params['netmhcpan'])


# mhcflurry
#x=indata['in'];path=params['workdir'];mhcflurry_docker='osj-mhcflurry:latest'
mhcflurry_docker_command="""
import pandas as pd
import mhcflurry
import warnings
params={
    'in':'/workdir/mhcflurry-input.pickle'
}
# Load a predictor
print('MHCflurry predictor loading')
warnings.simplefilter('ignore')
predictor = mhcflurry.Class1PresentationPredictor.load()
warnings.resetwarnings()
# Loading the files
indata={}
indata['in']=pd.read_pickle(params['in'])
# prediction
def mhcflurry_prediction(x):
    # Run the prediction allele by allele
    df=pd.DataFrame([])
    for hla in x['mhc'].unique():
        peptide=list(x.loc[x['mhc']==hla,'peptide'].unique())
        hla2=hla.split('-')[1]
        res=predictor.predict(peptide, [hla2],include_affinity_percentile=True)
        res['mhc']=hla
        df=pd.concat([df,res],axis=0)
    df=df.reset_index(drop=1).drop(['peptide_num','sample_name'],axis=1)
    x1=pd.merge(x,df,on=['peptide','mhc'],how='left')
    return x1
pdata={}
pdata['pred']=mhcflurry_prediction(x=indata['in'])
pdata['pred'].to_csv('/workdir/mhcflurry-result.tsv',sep='\t',index=None)
# Make the outputs readable from the host, then remove the staged input
import subprocess as sbp
cmd='chmod 777 -R /workdir/*'
sbp.call(cmd,shell=True)
sbp.call('rm -rf /workdir/mhcflurry-input.pickle',shell=True)
print('Result file saved')
"""

def running_mhcflurry(x,path,mhcflurry_docker_command,mhcflurry_docker='openvax/mhcflurry:latest'):
    '''
    # parameters
    x : dataframe with ['peptide','mhc','other columns']
    path : /path/to/workdir
    mhcflurry_docker_command : python script to run inside the docker container
    mhcflurry_docker : docker image name and tag
    # return
    None; this only stages the inputs and prints the docker commands to run by hand.
    The result is picked up later from {path}/mhcflurry-result.tsv.
    '''
    import subprocess as sbp
    # Export the command line
    docker_runpy_path=path+'/mhcflurry-docker-run.py'
    with open(docker_runpy_path,'w') as f:
        f.write(mhcflurry_docker_command)
        f.close()
    # Export necessary things
    import pickle
    pickle_path=path+'/mhcflurry-input.pickle'
    with open(pickle_path,'wb') as f:
        pickle.dump(obj=x,file=f)
        f.close()
    print('Running the following command line')
    print(f"docker run -p 9999:9999 -v {path}:/workdir --rm openvax/mhcflurry:latest")
    print('Then modify the exmple1 and run the prediction')
    print('!python /workdir/mhcflurry-docker-run.py')
    print('mhcflurry-input is located at /workdir/mhcflurry-input.pickle')


running_mhcflurry(x=pdata['in'],path=params['workdir'],mhcflurry_docker_command=mhcflurry_docker_command)
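# Hedged alternative (untested sketch): the same step without the interactive
# session, mirroring the mhcnuggets docker call further below.
# cmd=f"docker run -v {params['workdir']}:/workdir --rm {params['mhcflurry-docker-id']} python /workdir/mhcflurry-docker-run.py"
# sbp.call(cmd,shell=True)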

import os, time
not_yet=True
print('Waiting for mhcflurry-result.tsv')
while not_yet:
    if os.path.exists(params['workdir']+'/mhcflurry-result.tsv'):
        indata['mhcflurry']=pd.read_csv(params['workdir']+'/mhcflurry-result.tsv',sep='\t')
        not_yet=False
        print('mhcflurry data loaded')
    else:
        time.sleep(10)  # poll every 10 s instead of busy-waiting


# Drop helper columns the docker step may have added
remove_col=['Unnamed: 0','best_allele']
for c in remove_col:
    if c in indata['mhcflurry'].columns:
        indata['mhcflurry']=indata['mhcflurry'].drop(c,axis=1)


#--------------------------------------------------------
# mhcnuggets running
#--------------------------------------------------------
mhcnuggets_cmd="""
from mhcnuggets.src.predict import predict
import sys, os
import pandas as pd
params={
    'peptide':'/workdir/mhcnuggets_peptide_input.txt'
    #'peptide':'/workdir/p230314v99-tesla_608_peptides.txt'
    ,'hla':'/workdir/hla.txt'
}

indata={}
with open(params['hla'],'r') as f:
    indata['hla']=f.readlines()

print('HLA',indata['hla'])
# Run the prediction
pdata={}
def mhcnuggets_prediction(pep_path=params['peptide'],hla=indata['hla']):
    # mhcnuggets prints its predictions to stdout, so redirect stdout to a file
    stdoutOrigin=sys.stdout
    sys.stdout=open('/workdir/MHCnuggets-prediction.txt','w')
    for h in hla:
        print(h)
        predict(class_='I',peptides_path=pep_path,mhc=h,ba_models=True)
        print('#'*50)
    sys.stdout.close()
    sys.stdout=stdoutOrigin

mhcnuggets_prediction(pep_path=params['peptide'],hla=indata['hla'])

print('Prediction finished')

def mhcnuggets_parsing():
    with open('/workdir/MHCnuggets-prediction.txt','r') as f:
        x=f.readlines()
    # Flatten the captured stdout into individual tokens
    x1=[i.split(' ') for i in x]
    x1=sum(x1,[])
    # Get allele info
    hla=list(filter(lambda x: 'HLA-' in x and 'Closest' not in x,x1))[0].split()
    # prediction info
    content=list(filter(lambda x: ',' in x,x1))
    # Distribute the predictions (content) to their HLA alleles
    header_location=[idx for idx in range(len(content)) if 'peptide' in content[idx]]
    header_location+=[len(content)]
    x2=pd.DataFrame([])
    for idx,h in enumerate(hla): # h=hla[0];idx=0
        h1=content[header_location[idx]:header_location[idx+1]]
        h1=list(map(lambda x: x.split(','),h1))
        h2=pd.DataFrame(h1[1:],columns=h1[0])
        h2['HLA']=h
        x2=pd.concat([x2,h2])
    x2=x2.reset_index(drop=1)
    x2.columns=[c.split()[0] for c in x2.columns]
    for c in x2.columns:
        x2[c]=x2[c].apply(lambda x: x.split()[0])
    x2=x2.rename(columns={'ic50':'mhcnuggets_ic50'})
    return x2


pdata={}
pdata['mhcnuggets']=mhcnuggets_parsing()
pdata['mhcnuggets'].to_csv('/workdir/MHCnuggets-prediction.csv',index=None)
"""
#x=indata['in'];path=params['workdir'];mhcnuggets_dockerid=params['mhcnuggets-docker-id'];mhcnuggets_path=params['mhcnuggets-path']
def run_mhcnuggets(x,path,mhcnuggets_dockerid,mhcnuggets_cmd):
    '''
    NOTE: this step once failed because of a Docker problem on the host.
    x : dataframe with ['peptide','mhc','other columns']
    path : '/path/to/workdir'
    mhcnuggets_dockerid : e.g. 'osj-mhcnuggets'
    mhcnuggets_cmd : python script to run inside the docker container
    '''
    import subprocess as sbp
    import pandas as pd
    import gc
    # Get MHCs
    mhcs=x['mhc'].unique()
    # Make command line and run it.
    print('Running MHCnuggets')
    with open(path+'/mhcnuggets-run.py','w') as f:
        f.write(mhcnuggets_cmd)
    result=[]
    for mhc in mhcs: #mhc=mhcs[0]
        mhc2=mhc.replace('*','')  # 'HLA-A*01:01' -> 'HLA-A01:01'
        x1=x.loc[x['mhc']==mhc,'peptide'].unique()
        with open(path+'/mhcnuggets_peptide_input.txt','w') as f:
            f.write('\n'.join(x1))
        with open(path+'/hla.txt','w') as f:
            f.write(str(mhc2))
        cmd=f'docker run -v {path}:/workdir'
        cmd+=f' {mhcnuggets_dockerid} python /workdir/mhcnuggets-run.py'
        sbp.call(cmd,shell=True)    
        res=pd.read_csv(f'{path}/MHCnuggets-prediction.csv')
        result.append(res)
    df=pd.concat(result,axis=0).reset_index(drop=1)
    df=df.rename(columns={'HLA':'mhc'})
    # 'HLA-A01:01' -> 'HLA-A*01:01', restoring the original mhc format for the merge
    df['mhc']=df['mhc'].apply(lambda x: x[:5]+'*'+x[5:])
    # Attach the prediction result to original data
    x1=pd.merge(x,df,on=['mhc','peptide'],how='left')
    x1['mhcnuggets_ic50']=x1['mhcnuggets_ic50'].astype(float)
    gc.collect()
    return x1


indata['mhcnuggets']=run_mhcnuggets(x=pdata['in'],path=params['workdir'],
                                    mhcnuggets_dockerid=params['mhcnuggets-docker-id'],
                                    mhcnuggets_cmd=mhcnuggets_cmd)


# Merge the results of all tools into one table
def adding_all_result(x=indata,tools=['mhcnuggets','mhcflurry','netmhcpan','mixMHCpred']):
    x1={t: x[t] for t in tools}
    # Get the columns shared by every tool's table
    for idx,t in enumerate(x1.keys()):
        if idx==0:
            keep_col=set(x1[t].columns.tolist())
        else:
            keep_col=keep_col & set(x1[t].columns.tolist())
    # Suffix each tool-specific column with its tool name, then merge
    for idx,t in enumerate(x1.keys()): #t=tools[0]
        columns={i:f'{i}({t})' for i in set(x1[t].columns)-keep_col}
        t1=x1[t].rename(columns=columns)
        if idx==0:
            x2=t1.copy()
        else:
            x2=pd.merge(x2,t1,on=list(keep_col),how='outer')
    # return the result
    return x2
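# Illustrative example: every tool-specific column is tagged with its tool, e.g.
# mixMHCpred's 'Score' becomes 'Score(mixMHCpred)' and netmhcpan's 'Score_EL'
# becomes 'Score_EL(netmhcpan)', while shared columns such as 'peptide', 'mhc'
# and 'label' stay unsuffixed and drive the outer merge.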


pdata['out']=adding_all_result(x=indata,tools=['mhcnuggets','mhcflurry','netmhcpan','mixMHCpred'])

pdata['out'].to_pickle(params['out'])