본문 바로가기

R관련

조선일보 크롤링 R코드 스케치

728x90
반응형

# Chosun ilbo Crawling 조선일보 크롤링 코드 스케치
library(XML)
library(RCurl)

# Reading HTML information: collect article addresses from the list page.
#
# Result: `tmp`, a character vector of article addresses with the scheme
# ("http://") stripped and ending in ".html". It is a zero-length vector
# when the page contains no matching link.

# Return the n-th piece of every element of `x` after splitting on `split`.
# Elements with fewer than `n` pieces yield NA.
nth_piece <- function(x, split, n, fixed = FALSE) {
  vapply(strsplit(x, split, fixed = fixed), function(p) p[n], character(1))
}

tmp <- readLines("http://news.chosun.com/svc/list_in/list.html?catid=2&pn=1")

# Keep only lines that carry an anchor tag.
tmp <- tmp[grepl("a href", tmp)]

# Take what follows the first `<a href=`, drop the scheme, and cut at ".html".
# Only the first link on each line is considered.
tmp <- nth_piece(tmp, "<a href=", 2)
tmp <- nth_piece(tmp, "://", 2)
tmp <- nth_piece(tmp, ".html", 1, fixed = TRUE)

# Keep links on the news host only. grepl() is FALSE for NA, so lines where
# one of the splits above found nothing are dropped here as well. The match
# is literal so that "." is not treated as a regex wildcard.
tmp <- tmp[grepl("news.chosun.com", tmp, fixed = TRUE)]

# paste0() recycles a zero-length vector to "", which would fabricate the
# bogus address ".html"; only re-append the suffix when there are links.
if (length(tmp) > 0) {
  tmp <- paste0(tmp, ".html")
}

library(KoNLP) # Load KoNLP if you want to use its Hannanum morphological analyzer
# Download every article listed in `tmp` and break its text into tokens.
#
# Input : `tmp`  - article addresses without the "http://" scheme.
# Output: `tmp2` - tokens pooled over all articles (NULL when `tmp` is empty).

# Fetch one article page and return its tokens as a character vector.
# A page that cannot be downloaded is skipped with a warning instead of
# aborting the whole crawl.
extract_article_tokens <- function(article_url) {
  page <- tryCatch(
    readLines(article_url),
    error = function(e) {
      warning("Skipping ", article_url, ": ", conditionMessage(e),
              call. = FALSE)
      character(0)
    }
  )

  # Keep lines containing "par" (presumably the paragraph markup - confirm
  # against the page source) and drop those that carry hyperlinks.
  # Vectorised filtering replaces `if (grepl(...))`, whose condition is a
  # vector and therefore an error in R >= 4.2 for any length other than 1.
  body <- page[grepl("par", page)]
  body <- body[!grepl("a href", body)]

  # Split at tag ends and keep the fragments that contain a period.
  pieces <- unlist(strsplit(body, ">"))
  pieces <- pieces[grepl(".", pieces, fixed = TRUE)]

  # Cut each fragment at the first "<br", "<scri" or "</div". sub() leaves an
  # empty string where strsplit(...)[1] produced NA, so the literal text "NA"
  # no longer leaks into the pasted article text.
  pieces <- sub("(<br|<scri|</div).*$", "", pieces)

  # Glue the fragments together, turn periods into spaces, split on spaces.
  text <- paste(pieces, collapse = "")
  text <- gsub(".", " ", text, fixed = TRUE)
  words <- unlist(strsplit(text, " "))
  words <- gsub("href='http://newsplus", "", words, fixed = TRUE)

  # Remove quotation marks and exclamation marks.
  for (mark in c("”", "“", "‘", "’", "!", "\"")) {
    words <- gsub(mark, "", words, fixed = TRUE)
  }

  # Split on either parenthesis in one pass; chaining two strsplit() calls
  # fails with "non-character argument" once the first one returns NULL.
  words <- unlist(strsplit(words, "[()]"))

  # Drop empty and single-character tokens.
  words <- words[nchar(words) > 1]

  # Remove frequent Korean function words that carry no topical meaning.
  for (filler in c("라고", "만큼", "말했다", "통해")) {
    words <- gsub(filler, "", words, fixed = TRUE)
  }
  words
}

# lapply() over the addresses handles an empty `tmp` (1:length(tmp) would
# iterate over 1 and 0) and avoids re-copying `tmp2` on every iteration.
tmp2 <- unlist(
  lapply(tmp, function(address) {
    extract_article_tokens(paste0("http://", address))
  }),
  use.names = FALSE
)
# Remove tokens that are left-over HTML / URL fragments rather than words.
#
# Logical indexing with !grepl() is used instead of x[-grep(p, x)]: when the
# pattern matches nothing, grep() returns integer(0) and x[-integer(0)] is an
# EMPTY vector, which silently discarded every token.
markup_markers <- c("href=", "chosun", "<a", "com/", "style='",
                    "html", "10px", "jsp", "_blank")
for (marker in markup_markers) {
  # The markers contain no regex metacharacters, so literal matching is
  # equivalent to the former regex matching.
  tmp2 <- tmp2[!grepl(marker, tmp2, fixed = TRUE)]
}

# Strip residual fragments from the surviving tokens. Only these two
# substitutions can still match: every other fragment that used to be
# substituted here ("href='http://newsplus", "html", "10px", "chosun",
# "target='_blank'", "jsp'") contains one of the markers filtered above.
for (residue in c("stylemargin0", "compartnerindex")) {
  tmp2 <- gsub(residue, "", tmp2, fixed = TRUE)
}

library(RColorBrewer) # Load color palettes (brewer.pal)
library(wordcloud)    # Provides wordcloud(); it was called without being attached

# Draw a word cloud of the collected tokens in `tmp2`.
# "Set2" defines at most 8 colours; asking for 9 only triggers a warning and
# returns the same 8-colour palette, so request 8 directly.
if (length(tmp2) > 0) {
  wordcloud(tmp2,
            random.color = TRUE, random.order = FALSE, rot.per = 0.25,
            min.freq = 1, colors = brewer.pal(8, "Set2"), scale = c(5, 1))
} else {
  message("No tokens collected; skipping the word cloud.")
}

728x90
반응형

'R관련' 카테고리의 다른 글

chord plot(코드 플랏)  (0) 2017.03.03
벤다이어 그램 그리기  (0) 2017.02.21
크롤링과 스크랩핑  (0) 2017.01.18
특정 패키지 인스톨 여부 확인  (0) 2017.01.16
단백질(protein) sequence 얻는 법  (0) 2017.01.16