본문 바로가기

파이썬3

drug-indication-collector.py (drugBank용)

728x90
반응형

DrugBank에서 약물의 indication 부분만을 크롤링하기 위한 스크립트입니다.

 

# Get drug indication
import numpy as np
def drug_indication_extractor(soup):
    """Return the indication text found in a rendered drug page.

    The page is converted to a string and split into lines. The first line
    containing ``id="indication"`` and the line right after it are inspected;
    from each of them the text following the first ``<p>`` (up to the next
    ``<p>``, if any) is cut into fragments at every ``<`` and every ``sup>``.
    Fragments that look like markup are discarded and the rest is joined.

    Args:
        soup: Any object whose string form is the page HTML (the calling
            script passes a parsed BeautifulSoup document).

    Returns:
        The concatenated text fragments, or ``''`` when the page has no
        ``id="indication"`` line or neither inspected line contains ``<p>``.

    Note:
        Every fragment containing ``/`` is dropped, so a run of text that
        itself contains a slash is removed along with the closing tags.
    """
    lines = str(soup).split('\n')

    # Only the first occurrence of the marker is used.
    marker_index = next(
        (idx for idx, line in enumerate(lines) if 'id="indication"' in line),
        None,
    )
    if marker_index is None:
        # No indication section on this page: nothing to extract.
        return ''

    fragments = []
    for line in lines[marker_index:marker_index + 2]:
        paragraph_parts = line.split('<p>')
        if len(paragraph_parts) < 2:
            # This line carries no paragraph; skip it instead of failing.
            continue
        fragments.extend(paragraph_parts[1].split('<'))

    # Further split on the tail of <sup>/</sup> tags so the tag residue ends
    # up in its own fragment and can be filtered out below.
    for separator in ('sup>',):
        fragments = [
            piece
            for fragment in fragments
            for piece in fragment.split(separator)
        ]

    # Substrings that mark a fragment as markup rather than indication text.
    markup_markers = (
        'text-reference-group',
        'a class="reference-popover-link',
        '/',
    )
    kept = [
        fragment
        for fragment in fragments
        if not any(marker in fragment for marker in markup_markers)
    ]
    return ''.join(kept)

# Example run: fetch one DrugBank drug page and extract its indication text.
# NOTE(review): `requests` and `bs` are not imported in the visible part of
# this file - presumably `import requests` and
# `from bs4 import BeautifulSoup as bs`; confirm before running.
url = 'https://go.drugbank.com/drugs/DB08916'  # Afatinib

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop on HTTP errors instead of parsing an error page as if it were the
# drug page.
response.raise_for_status()

html = response.text
soup = bs(html, 'html.parser')

drug_indication_extractor(soup=soup)
728x90
반응형