본문 바로가기

파이썬3

유전자의 펩타이드 시퀀스 불러오는 함수

728x90
반응형

gene2peptide.py
0.00MB

biopython을 잘 사용할 줄 알면 쉽게 할 수 있을지 모르겠으나

 

구글링을 해도 누군가 만들어 둔 것이 없고 튜토리얼은 nucleotide서열을 불러오는 것만 있어서

 

다른 라이브러리를 활용해 만듬.

 

import requests as r
from Bio import SeqIO
from io import StringIO
import mygene as mg
import pandas as pd
mg = mg.MyGeneInfo()
def gene2peptide(gene,taxid=9606):
    '''
    This function will return every uniprotID and peptide sequence for the given gene.
    :param gene : gene ids
    :param taxid : taxonomy id to select species (default '9606', human)
    ncessary libraries are listed below
    import requests as r
    from Bio import SeqIO
    from io import StringIO
    import mygene as mg
    import pandas as pd
    mg = mg.MyGeneInfo()
    '''
    global mg, r, SeqIO, StringIO, pd
    # Get official genesymbol
    gene_infos=mg.query(gene)
    # Select species
    gene_select=[i for i in gene_infos['hits'] if taxid==i['taxid']][0]
    selected_gene_keys=list(gene_select.keys())
    necessary_infokeys=['_id','entrezgene']
    # identify existence of necessary_infokeys
    included_infokeys=[i for i in necessary_infokeys if i in selected_gene_keys]
    gene_select=dict(zip(necessary_infokeys,[gene_select[i] for i in included_infokeys]))
    # if there are failed keys, append them
    if len(included_infokeys)!=len(necessary_infokeys):
        failed_keys=list(set(necessary_infokeys)-set(included_infokeys))
        infokeys_with_NA=dict(zip(failed_keys,['N/A']*len(failed_keys)))
        gene_select.update(infokeys_with_NA) # This will append failed results
        gene_select=[gene_select[i] for i in necessary_infokeys] # Sorting result based on necessary_infokeys
        gene_select=dict(zip(necessary_infokeys,gene_select))
        
    # Get uniprot gene
    if gene_select['entrezgene']!='N/A':
        gene_ids=mg.getgenes(gene_select['entrezgene'],
                            fields='symbol,entrezgene,uniprot.Swiss-Prot',as_dataframe=True)
        gene_ids=gene_ids.drop(['_id','_version'],axis=1)
        # Retrieve peptide sequences
        peptide_seqs=[]
        baseUrl="http://www.uniprot.org/uniprot/"
        for i in range(gene_ids.shape[0]):
            cID=gene_ids['uniprot.Swiss-Prot'][i]
            currentUrl=baseUrl+cID+".fasta"
            response = r.post(currentUrl)
            cData=''.join(response.text)
            Seq=StringIO(cData)
            pSeq=list(SeqIO.parse(Seq,'fasta'))
            peptide_seqs.append(str(pSeq[0].seq))
        gene_ids['peptide']=peptide_seqs
    else:
        gene_ids=gene_select
        gene_ids['uniprot.Swiss-Prot']='N/A'
        gene_ids['peptide']='N/A'
        gene_ids=pd.DataFrame([gene_ids])
        gene_ids=gene_ids.drop(['_id'],axis=1)
    
    gene_ids['input']=gene
    return gene_ids

mg.getgenes([2064],fields='symbol,entrezgene,uniprot.Swiss-Prot',as_dataframe=True)
728x90
반응형