728x90
반응형
biopython을 잘 사용할 줄 알면 쉽게 할 수 있을지 모르겠으나
구글링을 해도 누군가 만들어 둔 것이 없고 튜토리얼은 nucleotide서열을 불러오는 것만 있어서
다른 라이브러리를 활용해 만듬.
import requests as r
from Bio import SeqIO
from io import StringIO
import mygene as mg
import pandas as pd
mg = mg.MyGeneInfo()
def gene2peptide(gene,taxid=9606):
'''
This function will return every uniprotID and peptide sequence for the given gene.
:param gene : gene ids
:param taxid : taxonomy id to select species (default '9606', human)
ncessary libraries are listed below
import requests as r
from Bio import SeqIO
from io import StringIO
import mygene as mg
import pandas as pd
mg = mg.MyGeneInfo()
'''
global mg, r, SeqIO, StringIO, pd
# Get official genesymbol
gene_infos=mg.query(gene)
# Select species
gene_select=[i for i in gene_infos['hits'] if taxid==i['taxid']][0]
selected_gene_keys=list(gene_select.keys())
necessary_infokeys=['_id','entrezgene']
# identify existence of necessary_infokeys
included_infokeys=[i for i in necessary_infokeys if i in selected_gene_keys]
gene_select=dict(zip(necessary_infokeys,[gene_select[i] for i in included_infokeys]))
# if there are failed keys, append them
if len(included_infokeys)!=len(necessary_infokeys):
failed_keys=list(set(necessary_infokeys)-set(included_infokeys))
infokeys_with_NA=dict(zip(failed_keys,['N/A']*len(failed_keys)))
gene_select.update(infokeys_with_NA) # This will append failed results
gene_select=[gene_select[i] for i in necessary_infokeys] # Sorting result based on necessary_infokeys
gene_select=dict(zip(necessary_infokeys,gene_select))
# Get uniprot gene
if gene_select['entrezgene']!='N/A':
gene_ids=mg.getgenes(gene_select['entrezgene'],
fields='symbol,entrezgene,uniprot.Swiss-Prot',as_dataframe=True)
gene_ids=gene_ids.drop(['_id','_version'],axis=1)
# Retrieve peptide sequences
peptide_seqs=[]
baseUrl="http://www.uniprot.org/uniprot/"
for i in range(gene_ids.shape[0]):
cID=gene_ids['uniprot.Swiss-Prot'][i]
currentUrl=baseUrl+cID+".fasta"
response = r.post(currentUrl)
cData=''.join(response.text)
Seq=StringIO(cData)
pSeq=list(SeqIO.parse(Seq,'fasta'))
peptide_seqs.append(str(pSeq[0].seq))
gene_ids['peptide']=peptide_seqs
else:
gene_ids=gene_select
gene_ids['uniprot.Swiss-Prot']='N/A'
gene_ids['peptide']='N/A'
gene_ids=pd.DataFrame([gene_ids])
gene_ids=gene_ids.drop(['_id'],axis=1)
gene_ids['input']=gene
return gene_ids
mg.getgenes([2064],fields='symbol,entrezgene,uniprot.Swiss-Prot',as_dataframe=True)
728x90
반응형
'파이썬3' 카테고리의 다른 글
파이썬 생존분석과 Kaplan-meier plot 그리기 (0) | 2022.01.13 |
---|---|
[아나콘다] git 설치 (0) | 2021.12.29 |
pandas 명령어들 (0) | 2021.11.26 |
drug-indication-collector.py (drugBank용) (0) | 2021.11.24 |
CLS archive SQLite에서 meta데이터 추출 (0) | 2020.10.23 |