[IEDB] MHC ligand binding dataset 파싱 파이썬 스크립트.

파이썬3

[IEDB] MHC ligand binding dataset 파싱 파이썬 스크립트.

TTSR 2024. 3. 12. 12:48

728x90

def select_necessary_info_in_iedb(x):
    """Reduce a raw IEDB MHC-ligand table to the columns used downstream.

    Parameters
    ----------
    x : pandas.DataFrame
        ``mhc_ligand_full.tsv`` / ``mhc_ligand_full.csv`` as loaded with
        ``pd.read_csv``.  It must contain the 16 source columns listed in
        ``rename_map`` and a row with index label ``0``; that row is
        discarded (presumably the second header line of the export --
        confirm against the file you load).  ``x`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows that have a PMID, de-duplicated and kept in input order,
        with a fresh ``0..n-1`` index and the columns ``peptide``, ``mhc``,
        ``label``, ``PMID``, ``antigen IRI``, ``mhc class``,
        ``Origin (peptide) species IRI``, ``Host (MHC) species``,
        ``Host (MHC) species IRI``, ``Method``, ``source molecule`` and
        ``Origin (peptide) species``.

    Raises
    ------
    KeyError
        If a required source column or the index label ``0`` is missing.
    """
    import gc

    # Source column -> working name.  Selecting by name makes a separate
    # "drop everything else" step unnecessary, and renaming through a mapping
    # cannot fail with a length mismatch the way positional renaming can.
    rename_map = {
        'Reference.2': 'PMID',
        'Epitope.2': 'peptide',
        'Epitope.10': 'Source Molecule',
        'Epitope.11': 'Source Molecule IRI',
        'Epitope.15': 'Source Organism IRI',
        'Epitope.12': 'Molecule Parent',
        'MHC Restriction.3': 'Class',
        'Epitope.13': 'Molecule Parent IRI',
        'Epitope.14': 'Source Organism',
        'Epitope.16': 'Species',
        'Epitope.17': 'Species IRI',
        'Host': 'species',
        'Host.1': 'IRI',
        'Assay.1': 'Method',
        'Assay.5': 'Qualitative Measurement',
        'MHC Restriction': 'mhc',
    }

    def merge_names(first, second):
        """Merge two name columns row by row into a list of strings.

        When one name contains the other (case-insensitive) the containing,
        i.e. longer, name is used; otherwise both are kept as
        ``first;second``.  Each distinct pair is resolved only once.
        """
        # NOTE(review): missing values are stringified (NaN -> 'nan'), which
        # is what the previous ``astype(str)`` based code did as well.
        cache = {}
        merged = []
        for pair in zip(map(str, first), map(str, second)):
            if pair not in cache:
                name_a, name_b = pair
                upper_a, upper_b = name_a.upper(), name_b.upper()
                if upper_a in upper_b:
                    cache[pair] = name_b
                elif upper_b in upper_a:
                    cache[pair] = name_a
                else:
                    # Fixed order keeps the result identical between runs.
                    cache[pair] = name_a + ';' + name_b
            merged.append(cache[pair])
        return merged

    # Keep the needed columns, discard the row labelled 0, use working names.
    x1 = x.loc[:, list(rename_map)].drop(0, axis=0).rename(columns=rename_map)

    # Remove experiments without reference; copy so the assignments below
    # write to an independent frame instead of a slice of ``x``.
    x1 = x1.loc[x1['PMID'].notna(), :].copy()
    x1['PMID'] = x1['PMID'].astype(int)

    # Source molecule: combine parent and molecule name, preferring the longer.
    x1['source molecule'] = merge_names(x1['Molecule Parent'], x1['Source Molecule'])
    x2 = x1.drop(['Molecule Parent', 'Source Molecule'], axis=1)
    x2 = x2.drop_duplicates().reset_index(drop=True)
    del x1

    # Origin species of the peptide: same rule on organism / species names.
    x2['Origin (peptide) species'] = merge_names(x2['Source Organism'], x2['Species'])

    x2 = x2.rename(columns={
        'species': 'Host (MHC) species',
        'Qualitative Measurement': 'label',
        'Class': 'mhc class',
        'Species IRI': 'Origin (peptide) species IRI',
        'IRI': 'Host (MHC) species IRI',
        'Source Molecule IRI': 'antigen IRI',
    })
    # Selecting the output columns also drops 'Source Organism', 'Species',
    # 'Source Organism IRI' and 'Molecule Parent IRI' (the latter overlaps
    # with the source molecule IRI).
    x2 = x2.loc[:, ['peptide', 'mhc', 'label', 'PMID', 'antigen IRI', 'mhc class',
                    'Origin (peptide) species IRI', 'Host (MHC) species',
                    'Host (MHC) species IRI', 'Method', 'source molecule',
                    'Origin (peptide) species']]
    gc.collect()
    return x2

위의 함수를 실행하면 데이터 용량이 상당히 줄어든다. 용량을 더 줄이고 싶다면 아래의 함수를 사용하면 된다.

구체적으로는, 펩타이드에 대한 분석만 할 것이므로 MHC와 비펩타이드 물질 간의 상호작용 데이터를 모두 제거한 뒤, 각 필드의 값을 짧은 코드로 바꾸고 코드와 원래 값의 대응 관계를 필드별 dictionary로 따로 저장한다.

# 데이터 용량 줄이기
# x=pdata['in']
def make_meta_info(x):
    """Keep peptide-only rows and replace descriptive fields by short codes.

    Parameters
    ----------
    x : pandas.DataFrame
        Table with a ``peptide`` column plus the nine columns listed in
        ``keys`` below (the column layout returned by
        ``select_necessary_info_in_iedb``).  ``x`` itself is not modified.

    Returns
    -------
    dict
        ``'data'``: the filtered table.  Every column in ``keys`` holds codes
        of the form ``'<field number>-<value number>'`` instead of the
        original values, and the previous ``0..n-1`` row index is stored in
        an extra leading ``index`` column.
        ``'meta'``: ``{column name: {code: original value as str}}``, which
        allows the codes to be translated back.
    """
    import gc  # imported locally: nothing else in this function's scope provides it

    # Select peptides only: every character must be one of the 20 standard
    # amino-acid letters.  Missing (non-string) entries are skipped instead of
    # raising a TypeError.
    print('Selecting peptide result')
    amino_acids = set('ACDEFGHIKLMNPQRSTVWY')
    peptides = [pep for pep in x['peptide'].unique()
                if isinstance(pep, str) and amino_acids.issuperset(pep)]
    x1 = x.loc[x['peptide'].isin(peptides), :].reset_index(drop=True)
    del peptides
    gc.collect()

    # Reduce the data size: each distinct value of a field is stored once in
    # ``meta_dict`` and referenced from the table through a short code.
    keys = ['PMID', 'antigen IRI', 'mhc class',
            'Origin (peptide) species IRI', 'Host (MHC) species',
            'Host (MHC) species IRI', 'Method', 'source molecule',
            'Origin (peptide) species']
    meta_dict = {}
    for idx, key in enumerate(keys):
        values = x1[key].astype(str)
        field_item = values.unique()
        field_key = [f'{idx}-{i}' for i in range(len(field_item))]
        meta_dict[key] = dict(zip(field_key, field_item))
        x1[key] = values.map(dict(zip(field_item, field_key)))

    # Keep the row number as an explicit 'index' column.
    x1 = x1.reset_index(drop=False)
    return {'data': x1, 'meta': meta_dict}

IEDB-T 데이터의 경우에는 아래의 함수를 사용하면 된다.

def select_necessary_info_in_iedb_T(x):
    """Reduce a raw IEDB table to the columns used downstream (IEDB-T variant).

    Unlike ``select_necessary_info_in_iedb`` this variant never required any
    columns beyond the 16 it keeps.

    Parameters
    ----------
    x : pandas.DataFrame
        The IEDB export as loaded with ``pd.read_csv`` (presumably the T-cell
        assay export -- confirm against the file you load).  It must contain
        the 16 source columns listed in ``rename_map`` and a row with index
        label ``0``; that row is discarded (presumably the second header line
        of the export).  ``x`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows that have a PMID, de-duplicated and kept in input order,
        with a fresh ``0..n-1`` index and the columns ``peptide``, ``mhc``,
        ``label``, ``PMID``, ``antigen IRI``, ``mhc class``,
        ``Origin (peptide) species IRI``, ``Host (MHC) species``,
        ``Host (MHC) species IRI``, ``Method``, ``source molecule`` and
        ``Origin (peptide) species``.

    Raises
    ------
    KeyError
        If a required source column or the index label ``0`` is missing.
    """
    import gc

    # Source column -> working name.  Renaming through a mapping cannot fail
    # with a length mismatch the way positional renaming can.
    rename_map = {
        'Reference.2': 'PMID',
        'Epitope.2': 'peptide',
        'Epitope.10': 'Source Molecule',
        'Epitope.11': 'Source Molecule IRI',
        'Epitope.15': 'Source Organism IRI',
        'Epitope.12': 'Molecule Parent',
        'MHC Restriction.3': 'Class',
        'Epitope.13': 'Molecule Parent IRI',
        'Epitope.14': 'Source Organism',
        'Epitope.16': 'Species',
        'Epitope.17': 'Species IRI',
        'Host': 'species',
        'Host.1': 'IRI',
        'Assay.1': 'Method',
        'Assay.5': 'Qualitative Measurement',
        'MHC Restriction': 'mhc',
    }

    def merge_names(first, second):
        """Merge two name columns row by row into a list of strings.

        When one name contains the other (case-insensitive) the containing,
        i.e. longer, name is used; otherwise both are kept as
        ``first;second``.  Each distinct pair is resolved only once.
        """
        # NOTE(review): missing values are stringified (NaN -> 'nan'), which
        # is what the previous ``astype(str)`` based code did as well.
        cache = {}
        merged = []
        for pair in zip(map(str, first), map(str, second)):
            if pair not in cache:
                name_a, name_b = pair
                upper_a, upper_b = name_a.upper(), name_b.upper()
                if upper_a in upper_b:
                    cache[pair] = name_b
                elif upper_b in upper_a:
                    cache[pair] = name_a
                else:
                    # Fixed order keeps the result identical between runs.
                    cache[pair] = name_a + ';' + name_b
            merged.append(cache[pair])
        return merged

    # Keep the needed columns, discard the row labelled 0, use working names.
    x1 = x.loc[:, list(rename_map)].drop(0, axis=0).rename(columns=rename_map)

    # Remove experiments without reference; copy so the assignments below
    # write to an independent frame instead of a slice of ``x``.
    x1 = x1.loc[x1['PMID'].notna(), :].copy()
    x1['PMID'] = x1['PMID'].astype(int)

    # Source molecule: combine parent and molecule name, preferring the longer.
    x1['source molecule'] = merge_names(x1['Molecule Parent'], x1['Source Molecule'])
    x2 = x1.drop(['Molecule Parent', 'Source Molecule'], axis=1)
    x2 = x2.drop_duplicates().reset_index(drop=True)
    del x1

    # Origin species of the peptide: same rule on organism / species names.
    x2['Origin (peptide) species'] = merge_names(x2['Source Organism'], x2['Species'])

    x2 = x2.rename(columns={
        'species': 'Host (MHC) species',
        'Qualitative Measurement': 'label',
        'Class': 'mhc class',
        'Species IRI': 'Origin (peptide) species IRI',
        'IRI': 'Host (MHC) species IRI',
        'Source Molecule IRI': 'antigen IRI',
    })
    # Selecting the output columns also drops 'Source Organism', 'Species',
    # 'Source Organism IRI' and 'Molecule Parent IRI' (the latter overlaps
    # with the source molecule IRI).
    x2 = x2.loc[:, ['peptide', 'mhc', 'label', 'PMID', 'antigen IRI', 'mhc class',
                    'Origin (peptide) species IRI', 'Host (MHC) species',
                    'Host (MHC) species IRI', 'Method', 'source molecule',
                    'Origin (peptide) species']]
    gc.collect()
    return x2

728x90