본문 바로가기

Bioinformatics(생정보학)

[python] di-peptide encoding

728x90
반응형

di-peptide encoding은 단백질 서열을 벡터로 표현하는 방법 중 하나로, 표준 아미노산 20종으로 만들 수 있는 아미노산 2개 조합(20 × 20 = 400가지) 각각이 서열 안에서 차지하는 빈도(비율)를 400차원 벡터로 나타낸다.

 

import itertools
from collections import Counter
import numpy as np

# One-letter codes of the 20 standard amino acids.
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

# All 20 x 20 = 400 ordered residue pairs; the position of a pair in this
# list is its column in the encoded vector.
di_peptides = list(map(''.join, itertools.product(amino_acids, repeat=2)))


def dipeptide_encoding(seq):
    """Encode a peptide sequence as its di-peptide frequency vector.

    Every pair of adjacent residues (overlapping windows of width 2) is
    counted, and each count is divided by the number of windows, i.e.
    ``len(seq) - 1``. Pairs that are not listed in ``di_peptides`` still
    count towards that divisor but do not appear in the output.

    seq : peptide sequence (string of one-letter residue codes)
    return : numpy array of shape (1, 400), columns ordered like
        ``di_peptides``; all zeros when ``seq`` has fewer than two residues
    """
    # Overlapping windows of two consecutive residues.
    windows = [seq[start:start + 2] for start in range(len(seq) - 1)]
    window_total = len(windows)
    occurrences = Counter(windows)

    # Walk the reference list so the output order is fixed; pairs that never
    # occurred keep a frequency of 0 (this also avoids dividing by zero when
    # there are no windows at all).
    frequencies = [
        occurrences[pair] / window_total if pair in occurrences else 0
        for pair in di_peptides
    ]

    return np.array(frequencies, dtype=float).reshape(1, 400)

# 아래 것보다 좀 더 빠름.

 

아래는 위의 것보다 조금 느린 방식이다.

import itertools
from collections import Counter
import numpy as np

# List of standard amino acids (one-letter codes)
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
# Generate all possible di-peptides (ordered pairs of two amino acids);
# the position of a di-peptide in this tuple is its column in the encoding.
di_peptides = tuple([''.join(pair) for pair in itertools.product(list(amino_acids), repeat=2)])
# Di-peptide -> column index, built once so each lookup is a constant-time
# dict access instead of a linear ``tuple.index`` scan.
_DI_PEPTIDE_INDEX = {dipeptide: column for column, dipeptide in enumerate(di_peptides)}


def dipeptide_encoding(seq):
    '''
    Encode a peptide sequence as a vector of di-peptide frequencies.

    Every pair of adjacent residues (overlapping windows of width 2) is
    counted and each count is divided by the number of windows,
    ``len(seq) - 1``.

    Di-peptides that contain a residue outside ``amino_acids`` (for
    example ``X`` or a lowercase letter) have no column in the output and
    are skipped instead of raising ``ValueError``; they still count
    towards the divisor, so the row may sum to less than 1.

    seq : peptide sequence (string of one-letter residue codes)
    return : 1 x 400 numpy array, columns ordered like ``di_peptides``;
        all zeros when ``seq`` has fewer than two residues
    '''
    # Generate all overlapping di-peptides
    dipeptides = [seq[i:i + 2] for i in range(len(seq) - 1)]
    # Generate vector
    vector = np.zeros(400)
    # Normalize by the number of di-peptide windows (not the sequence length)
    total_dipeptides = len(dipeptides)
    # Encoding the sequence; the loop body never runs when there are no
    # windows, so the division below cannot hit a zero divisor.
    for dipeptide, count in Counter(dipeptides).items():
        column = _DI_PEPTIDE_INDEX.get(dipeptide)
        if column is None:
            # Non-standard residue in this window: no matching column.
            continue
        vector[column] = count / total_dipeptides
    return vector.reshape(1, 400)

 

728x90
반응형