728x90
반응형
'''
Compute binding/presentation scores for the given pMHC class I combinations
with the following tools (translated from the original Korean docstring):
NetMHCpan 4.1
MixMHCpred2.0.2
DeepOmicsNEO
MHCflurry 2.1.0
mhcnuggets
'''
# Parameter and library setup.
# NOTE(review): 'in' and 'out' are the same path, so the final to_pickle
# overwrites the input table -- confirm this is intended.
params={
'in':'/data01/dataset/neoantigen/public/iedb/20240308/minor-allele-testset.pickle'
,'out':'/data01/dataset/neoantigen/public/iedb/20240308/minor-allele-testset.pickle'
# Softwares (local executable paths / docker image ids)
,'mixmhcpred':'/data03/project/sjoh/00_tools/MixMHCpred/MixMHCpred'
,'netmhcpan':'/data03/project/sjoh/00_tools/netmhcpan/v4.1pvac/netmhcpan_4_1_executable/netMHCpan'
,'mhcflurry-docker-id':'openvax/mhcflurry:latest'
,'mhcnuggets-docker-id':'osj-mhcnuggets'
,'mhcnuggets-path':'/data03/project/sjoh/00_tools/mhcnuggets/mhcnuggets'
# parameters
,'random-fold':10  # how many random decoy peptides per input row
}
# Work directory = directory part of the output path.
params['workdir']='/'.join(params['out'].split('/')[:-1])
import subprocess as sbp
import pandas as pd
import gc
def loading_pMHC_table(path):
    '''
    Load a pMHC table from a pickle/tsv/csv file.

    Parameters
    ----------
    path : str
        Path to the table file; the format is picked by substring match
        ('pickle', '.tsv', '.csv'). The table should contain the columns
        [peptide, mhc].

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the path matches none of the supported formats.
        (The original fallback printed a hint, hit the undefined name
        `warn`, and returned an unbound variable -- fail loudly instead.)
    '''
    if 'pickle' in path:
        return pd.read_pickle(path)
    if '.tsv' in path:
        return pd.read_csv(path, sep='\t')
    if '.csv' in path:
        return pd.read_csv(path, sep=',')
    raise ValueError(
        'Check the pMHC table: unsupported file type '
        '(table column should contain [peptide,mhc])'
    )
# Shared dict of dataframes; 'in' holds the raw pMHC table.
indata={}
indata['in']=loading_pMHC_table(path=params['in'])
# adding random peptides (decoys); the commented line below is a debug snippet
#x=indata['in'];pep_len=[8,12];fold=5
def random_peptide_generator(x,pep_len=[8,12],fold=10):
    '''
    Append `fold` random decoy peptides per input row.

    Each decoy gets a uniformly random length in [pep_len[0], pep_len[1]],
    a uniformly random amino-acid sequence, an MHC allele taken by cycling
    the input's 'mhc' column, and the label 'Random'.

    Parameters
    ----------
    x : pandas.DataFrame with ['peptide','mhc','label'] columns.
    pep_len : [min_length, max_length] of the generated peptides.
    fold : number of decoys generated per input row.

    Returns
    -------
    pandas.DataFrame : x with the decoy rows appended, index reset.
    '''
    import random
    alphabet = list('ACDEFGHIKLMNPQRSTVWY')  # the 20 standard amino acids
    lengths = list(range(pep_len[0], pep_len[1] + 1))
    n_decoys = x.shape[0] * fold
    decoys = [
        ''.join(random.choices(alphabet, k=random.choices(lengths, k=1)[0]))
        for _ in range(n_decoys)
    ]
    negatives = pd.DataFrame({
        'peptide': decoys,
        'mhc': x['mhc'].tolist() * fold,
        'label': ['Random'] * n_decoys,
    })
    return pd.concat([x, negatives], axis=0).reset_index(drop=1)
# pdata holds processed tables; augment the input with random decoy peptides.
pdata={}
pdata['in']=random_peptide_generator(x=indata['in'],pep_len=[8,12],fold=params['random-fold'])
# Remove duplicated / contradictory pMHC pairs
def remove_duplicate_and_contradictive_pMHC(x=None):
    '''
    Drop every (peptide, mhc) pair that occurs more than once.

    Labels are concatenated per (peptide, mhc) group; any group whose joined
    label contains a comma -- i.e. the pair appeared more than once, whether
    with conflicting or identical labels -- is discarded entirely.

    Parameters
    ----------
    x : pandas.DataFrame
        Table with at least the columns ['peptide', 'mhc', 'label'].
        Required. (Bug fix: the original default evaluated pdata['in'] at
        definition time, silently freezing module state into the signature.)

    Returns
    -------
    pandas.DataFrame with only the uniquely-occurring pairs, index reset.

    Raises
    ------
    ValueError
        If x is not provided.
    '''
    if x is None:
        raise ValueError("x (a DataFrame with ['peptide','mhc','label']) is required")
    grouped = x.groupby(['peptide', 'mhc']).agg(
        {'label': lambda labels: ','.join(labels)}
    ).reset_index(drop=0)
    unique_only = grouped.loc[
        grouped['label'].apply(lambda joined: ',' not in joined), :
    ].reset_index(drop=1)
    return unique_only
# Drop duplicated/contradictory pMHC pairs before scoring.
pdata['in']=remove_duplicate_and_contradictive_pMHC(x=pdata['in'])
# mixMHCpred command line
#/path/to/MixMHCpred -i input_file -o output_file -a A0101,A2501,B0801,B1801
# -i Ideally a text file
def running_mixMHCpred(x=indata['in'],path=params['workdir'],mixmhcpred=params['mixmhcpred']):
    '''
    Run MixMHCpred on the unique peptides of x and merge its scores back.

    Parameters
    ----------
    x : pandas.DataFrame with ['peptide','mhc',...] columns.
        NOTE(review): the defaults are evaluated once at definition time
        from module state; the call site below passes everything explicitly.
    path : work directory where the input/output text files are written.
    mixmhcpred : path to the MixMHCpred executable.

    Returns
    -------
    pandas.DataFrame : x plus '%Rank' and 'Score' columns per (peptide, mhc).
    '''
    import subprocess as sbp
    import pandas as pd
    # Exporting the peptide file (one peptide per line).
    peptides='\n'.join(x['peptide'].unique())
    with open(path+'/mixMHCpred_peptide_input.txt','w') as f:
        f.write(peptides)
    # Get MHCs. MixMHCpred wants bare allele names: 'HLA-A*02:01' -> 'A0201'.
    mhcs=x['mhc'].unique()
    mhcs=[i.split('-')[1].replace('*','').replace(':','') for i in mhcs]
    mhcs=','.join(mhcs)
    # Make command line and run it.
    # NOTE(review): shell=True with interpolated paths -- acceptable for
    # trusted local config, do not expose to untrusted input.
    cmd=f'{mixmhcpred} -i {path}/mixMHCpred_peptide_input.txt -o {path}/mixMHCpred-result.txt -a {mhcs}'
    sbp.call(cmd,shell=True)
    # Loading the result ('#' lines are MixMHCpred header comments).
    mm=pd.read_csv(f'{path}/mixMHCpred-result.txt',sep='\t',comment='#')
    # Attach the prediction result to original data: drop the best-allele
    # summary columns, keep the per-allele Score_X / %Rank_X columns.
    mm=mm.drop(['Score_bestAllele','BestAllele','%Rank_bestAllele'],axis=1)
    x1=x.copy()
    # Same allele-key normalization as above so we can join on it.
    x1['mixMHCkey']=x1['mhc'].apply(lambda x: x.split('-')[1].replace('*','').replace(':',''))
    # Wide per-allele columns -> long rows, then pivot back to one row per
    # (peptide, allele) with separate '%Rank' and 'Score' columns.
    mm1=mm.melt('Peptide')
    mm1[['category','mixMHCkey']]=mm1['variable'].str.split('_').tolist()
    mm2=mm1.pivot(index=['Peptide','mixMHCkey'],columns='category').reset_index().loc[:,['Peptide','mixMHCkey','value']]
    mm2.columns=['peptide','mixMHCkey','%Rank','Score']
    x2=pd.merge(x1,mm2,on=['peptide','mixMHCkey']).drop('mixMHCkey',axis=1)
    gc.collect()
    return x2
# Score every pMHC pair with MixMHCpred.
indata['mixMHCpred']=running_mixMHCpred(x=pdata['in'],path=params['workdir'],mixmhcpred=params['mixmhcpred'])
# NetMHCpan command line
#/path/to/netMHCpan -p test.pep -BA -a HLA-A01:01,HLA-A02:01 > output_file
def reading_netmhcpan_result(path):
    '''
    Parse a raw NetMHCpan 4.1 stdout dump into a DataFrame.

    Parameters
    ----------
    path : str
        Path to the text file captured from `netMHCpan ... > file`.

    Returns
    -------
    pandas.DataFrame
        One row per prediction line, with the first 16 header columns
        except 'Identity' and 'Pos'; the EL/BA score columns are cast to
        float where present.
    '''
    with open(path, 'r') as f:
        lines = f.readlines()
    # Drop comment lines ('#') and near-empty separator lines.
    content = [i for i in lines if '#' not in i and len(i) > 3]
    # The header is the first line mentioning both 'Pos' and 'MHC'.
    header = [i for i in content if 'Pos' in i and 'MHC' in i][0]
    # Keep only prediction rows: drop repeated headers, dashed rules and
    # per-protein footer/summary lines.
    content = [i for i in content
               if header not in i and '------------' not in i
               and 'Distance' not in i and 'Protein' not in i]
    header = header.split()[:16]
    # NetMHCpan appends extra columns (e.g. BindLevel) past the first 16;
    # truncate each row to match the header.
    content = pd.DataFrame([i.split()[:16] for i in content],
                           columns=header).drop(['Identity', 'Pos'], axis=1)
    # Cast score columns to numeric. The BA columns are absent when the
    # -BA flag was not used, so tolerate missing/unparsable columns
    # explicitly (the original used a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit).
    for c in ['Score_EL', '%Rank_EL', 'Score_BA', '%Rank_BA', 'Aff(nM)']:
        try:
            content[c] = content[c].astype(float)
        except (KeyError, ValueError):
            continue
    return content
def running_NetMHCpan(x=indata['in'],path=params['workdir'],netmhcpan=params['netmhcpan']):
    '''
    Run NetMHCpan 4.1 in -BA mode on the unique peptides of x and merge scores.

    Parameters
    ----------
    x : pandas.DataFrame with ['peptide','mhc',...] columns.
        NOTE(review): defaults are evaluated at definition time from module
        state; the call site below passes everything explicitly.
    path : work directory where the input/result files are written.
    netmhcpan : path to the netMHCpan executable.

    Returns
    -------
    pandas.DataFrame : x plus EL/BA score, rank and Aff(nM) columns.
    '''
    import subprocess as sbp
    import pandas as pd
    # Exporting the peptide file (one peptide per line).
    peptides='\n'.join(x['peptide'].unique())
    with open(path+'/NetMHCpan_peptide_input.txt','w') as f:
        f.write(peptides)
    # Get MHCs. NetMHCpan takes alleles like 'HLA-A02:01' (drop the '*').
    mhcs=x['mhc'].unique()
    mhcs=[i.replace('*','') for i in mhcs]
    mhcs=','.join(mhcs)
    # Make command line and run it (stdout redirected to the result file).
    print('Running NetMHCpan')
    cmd=f'{netmhcpan} -p {path}/NetMHCpan_peptide_input.txt -BA -a {mhcs} > {path}/NetMHCpan-result.txt'
    sbp.call(cmd,shell=True)
    # Loading the result
    nmp=reading_netmhcpan_result(path=f'{path}/NetMHCpan-result.txt')
    # Attach the prediction result to original data.
    # NOTE(review): this assumes the MHC names in NetMHCpan's output match
    # x['mhc'] (including the '*') -- confirm against a real result file.
    keep_col=['MHC','Peptide','Score_EL','%Rank_EL','Score_BA','%Rank_BA','Aff(nM)']
    nmp1=nmp.loc[:,keep_col].rename(columns={'MHC':'mhc','Peptide':'peptide'})
    x1=pd.merge(x,nmp1,on=['mhc','peptide'])
    gc.collect()
    return x1
# Score every pMHC pair with NetMHCpan 4.1.
indata['netmhcpan']=running_NetMHCpan(x=pdata['in'],path=params['workdir'],netmhcpan=params['netmhcpan'])
# mhcflurry; the commented line below is a debug snippet
#x=indata['in'];path=params['workdir'];mhcflurry_docker='osj-mhcflurry:latest'
mhcflurry_docker_command="""
import pandas as pd
import mhcflurry
import warnings
params={
'in':'/workdir/mhcflurry-input.pickle'
}
# Load a predictor
print('MHCflurry predictor loading')
warnings.simplefilter('ignore')
predictor = mhcflurry.Class1PresentationPredictor.load()
warnings.resetwarnings()
# Loading the files
indata={}
indata['in']=pd.read_pickle(params['in'])
# prediction
def mhcflurry_prediction(x):
# 예측 실행
df=pd.DataFrame([])
for hla in x['mhc'].unique():
peptide=list(x.loc[x['mhc']==hla,'peptide'].unique())
hla2=hla.split('-')[1]
res=predictor.predict(peptide, [hla2],include_affinity_percentile=True)
res['mhc']=hla
df=pd.concat([df,res],axis=0)
df=df.reset_index(drop=1).drop(['peptide_num','sample_name'],axis=1)
x1=pd.merge(x,df,on=['peptide','mhc'],how='left')
return x1
pdata={}
pdata['pred']=mhcflurry_prediction(x=indata['in'])
pdata['pred'].to_csv('/workdir/mhcflurry-result.tsv',sep='\t',index=None)
# chmod777 부여
import subprocess as sbp
cmd='chmod 777 -R /workdir/*'
sbp.call(cmd,shell=True)
sbp.call('rm -rf /workdir/mhcflurry-input.pickle',shell=True)
print('파일 저장완료')
"""
def running_mhcflurry(x,path,mhcflurry_docker_command,mhcflurry_docker='openvax/mhcflurry:latest'):
    '''
    Stage the inputs for a manual MHCflurry run inside docker.

    Writes the in-container python script and the pickled input table into
    `path`, then prints the docker command the operator must run by hand.
    The container is NOT launched here; the module-level wait loop below
    polls for the result file.

    Parameters
    ----------
    x : dataframe with ['peptide','mhc','other columns'] (any picklable obj).
    path : /path/to/workdir, mounted as /workdir inside the container.
    mhcflurry_docker_command : python source executed inside the container.
    mhcflurry_docker : docker image name:tag shown in the printed command.
        (Bug fix: this parameter used to be ignored -- the printed command
        hard-coded 'openvax/mhcflurry:latest'.)

    Returns
    -------
    None (side effects: two files written under `path`, instructions printed).
    '''
    import pickle
    # Export the in-container command script.
    docker_runpy_path = path + '/mhcflurry-docker-run.py'
    with open(docker_runpy_path, 'w') as f:
        f.write(mhcflurry_docker_command)
    # Export the input table for the container.
    pickle_path = path + '/mhcflurry-input.pickle'
    with open(pickle_path, 'wb') as f:
        pickle.dump(obj=x, file=f)
    print('Running the following command line')
    print(f"docker run -p 9999:9999 -v {path}:/workdir --rm {mhcflurry_docker}")
    print('Then modify the exmple1 and run the prediction')
    print('!python /workdir/mhcflurry-docker-run.py')
    print('mhcflurry-input is located at /workdir/mhcflurry-input.pickle')
# Stage the docker inputs; the container must then be run manually (see the
# printed instructions).
running_mhcflurry(x=pdata['in'],path=params['workdir'],mhcflurry_docker_command=mhcflurry_docker_command)
import os
import time
not_yet=True
print('Waiting the mhcflurry-result.tsv')
# Poll until the manually-run container drops the result file, then load it.
while not_yet:
    if os.path.exists(params['workdir']+'/mhcflurry-result.tsv'):
        indata['mhcflurry']=pd.read_csv(params['workdir']+'/mhcflurry-result.tsv',sep='\t')
        not_yet=False
        print('mhcflurry data loaded')
    else:
        # Bug fix: the original loop busy-waited at 100% CPU; back off
        # between filesystem polls.
        time.sleep(5)
# Drop index/summary columns the container-side to_csv may have added.
remove_col=['Unnamed: 0','best_allele']
for c in remove_col:
    if c in indata['mhcflurry'].columns:
        indata['mhcflurry']=indata['mhcflurry'].drop(c,axis=1)
#--------------------------------------------------------
# mhcnuggets running
#--------------------------------------------------------
mhcnuggets_cmd="""
from mhcnuggets.src.predict import predict
import sys, os
import pandas as pd
params={
'peptide':'/workdir/mhcnuggets_peptide_input.txt'
#'peptide':'/workdir/p230314v99-tesla_608_peptides.txt'
,'hla':'/workdir/hla.txt'
}
indata={}
with open(params['hla'],'r') as f:
indata['hla']=f.readlines()
f.close()
print('HLA',indata['hla'])
# 예측 시작
pdata={}
def mhcnuggets_prediction(pep_path=params['peptide'],hla=indata['hla']):
stdoutOrigin=sys.stdout
sys.stdout=open('/workdir/MHCnuggets-prediction.txt','w')
for h in hla:
print(h)
predict(class_='I',peptides_path=pep_path,mhc=h,ba_models=True)
print('#'*50)
sys.stdout.close()
sys.stdout=stdoutOrigin
print('Prediction finished')
mhcnuggets_prediction(pep_path=params['peptide'],hla=indata['hla'])
from copy import deepcopy as dc
def mhcnuggets_parsing():
with open('/workdir/MHCnuggets-prediction.txt','r') as f:
x=f.readlines()
f.close()
x1=[i.split(' ') for i in x]
x1=sum(x1,[])
# Get allele info
hla=list(filter(lambda x: 'HLA-' in x and 'Closest' not in x,x1))[0].split()
# prediction info
content=list(filter(lambda x: ',' in x,x1))
# 예측결과 (content)를 hla-allele대로 분배하기
header_location=[idx for idx in range(len(content)) if 'peptide' in content[idx]]
header_location+=[len(content)]
x2=pd.DataFrame([])
for idx,h in enumerate(hla): # h=hla[0];idx=0
h1=content[header_location[idx]:header_location[idx+1]]
h1=list(map(lambda x: x.split(','),h1))
h2=pd.DataFrame(h1[1:],columns=h1[0])
h2['HLA']=h
x2=pd.concat([x2,h2])
x2=x2.reset_index(drop=1)
x2.columns=[c.split()[0] for c in x2.columns]
for c in x2.columns:
x2[c]=x2[c].apply(lambda x: x.split()[0])
x2=x2.rename(columns={'ic50':'mhcnuggets_ic50'})
return x2
pdata={}
pdata['mhcnuggets']=mhcnuggets_parsing()
pdata['mhcnuggets'].to_csv('/workdir/MHCnuggets-prediction.csv',index=None)
"""
#x=indata['in'];path=params['workdir'];mhcnuggets_dockerid=params['mhcnuggets-docker-id'];mhcnuggets_path=params['mhcnuggets-path']
def run_mhcnuggets(x,path,mhcnuggets_dockerid,mhcnuggets_cmd):
    '''
    Run mhcnuggets inside docker, one allele at a time, and merge the IC50s.

    (Original Korean note: "something went wrong with docker, so this could
    not be run.")

    Parameters
    ----------
    x : dataframe with ['peptide','mhc','other columns']
    path : '/path/to/workdir' (mounted as /workdir inside the container)
    mhcnuggets_dockerid : docker image id, e.g. 'osj-mhcflurry'
    mhcnuggets_cmd : python source executed inside the container

    Returns
    -------
    pandas.DataFrame : x left-merged with a float 'mhcnuggets_ic50' column.
    '''
    import subprocess as sbp
    import pandas as pd
    import gc
    # Get MHCs
    mhcs=x['mhc'].unique()
    # Write the in-container script once; it is re-run per allele.
    print('Running MHCnuggets')
    with open(path+'/mhcnuggets-run.py','w') as f:
        f.write(mhcnuggets_cmd)
        f.close()
    result=[]
    for mhc in mhcs: #mhc=mhcs[0]
        # mhcnuggets allele names omit '*': 'HLA-A*02:01' -> 'HLA-A02:01'.
        mhc2=mhc.replace('*','')
        x1=x.loc[x['mhc']==mhc,'peptide'].unique()
        # Per-allele peptide list and single-line hla.txt read by mhcnuggets_cmd.
        with open(path+'/mhcnuggets_peptide_input.txt','w') as f:
            f.write('\n'.join(x1))
            f.close()
        with open(path+'/hla.txt','w') as f:
            f.write(str(mhc2))
            f.close()
        cmd=f'docker run -v {path}:/workdir'
        cmd+=f' {mhcnuggets_dockerid} python /workdir/mhcnuggets-run.py'
        sbp.call(cmd,shell=True)
        # Each docker run overwrites the same CSV; read it before the next allele.
        res=pd.read_csv(f'{path}/MHCnuggets-prediction.csv')
        result.append(res)
    df=pd.concat(result,axis=0).reset_index(drop=1)
    df=df.rename(columns={'HLA':'mhc'})
    # Restore the '*' (after 'HLA-A' etc.) so alleles match x['mhc'] again.
    # NOTE(review): assumes the '*' always belongs at index 5 -- confirm for
    # all allele name formats used here.
    df['mhc']=df['mhc'].apply(lambda x: x[:5]+'*'+x[5:])
    # Attach the prediction result to original data
    x1=pd.merge(x,df,on=['mhc','peptide'],how='left')
    x1['mhcnuggets_ic50']=x1['mhcnuggets_ic50'].astype(float)
    gc.collect()
    return x1
# Score every pMHC pair with mhcnuggets (via docker).
indata['mhcnuggets']=run_mhcnuggets(x=pdata['in'],path=params['workdir'],
    mhcnuggets_dockerid=params['mhcnuggets-docker-id'],mhcnuggets_cmd=mhcnuggets_cmd)
# adding all
def adding_all_result(x, tools=('mhcnuggets','mhcflurry','netmhcpan','mixMHCpred')):
    '''
    Merge the per-tool prediction tables into one wide table.

    Columns shared by every tool table (e.g. peptide/mhc/label) become the
    merge keys; tool-specific columns are renamed to 'col(tool)' to avoid
    collisions, and the tables are outer-merged in order.

    Parameters
    ----------
    x : dict[str, pandas.DataFrame]
        Mapping from tool name to that tool's result table.
        (Bug fix: the body previously read the module-level `indata`
        instead of this argument, so `x` was dead; the old def-time
        default `x=indata` is removed along with it.)
    tools : sequence of str
        Keys of `x` to merge, in merge order.

    Returns
    -------
    pandas.DataFrame
    '''
    x1 = {t: x[t] for t in tools}
    # Columns present in every table act as the join keys.
    for idx, t in enumerate(x1.keys()):
        if idx == 0:
            keep_col = set(x1[t].columns.tolist())
        else:
            keep_col = keep_col & set(x1[t].columns.tolist())
    # Outer-merge the tables, tagging tool-specific columns with the tool name.
    for idx, t in enumerate(x1.keys()):
        columns = {i: f'{i}({t})' for i in set(x1[t].columns) - keep_col}
        t1 = x1[t].rename(columns=columns)
        if idx == 0:
            x2 = t1.copy()
        else:
            x2 = pd.merge(x2, t1, on=list(keep_col), how='outer')
    # return the result
    return x2
# Merge all tool outputs into one wide table and persist it.
# NOTE(review): params['out'] equals params['in'], so this overwrites the
# input pickle -- confirm this is intended.
pdata['out']=adding_all_result(x=indata,tools=['mhcnuggets','mhcflurry','netmhcpan','mixMHCpred'])
pdata['out'].to_pickle(params['out'])
728x90
반응형
Other posts in the 'Bioinformatics(생정보학)' category:

- [de novo assembly] cap3 설치법 (0) | 2024.04.29
- Samtools 설치법 (0) | 2024.04.23
- mhcnuggets 도커파일 (0) | 2024.03.18
- mhcflurry 설치, 실행 및 결과 불러오기 (1) | 2024.03.15
- MixMHCpred 실행 및 결과 불러오기 (0) | 2024.03.15