2.PCA and feature selection.pptx
# PCA start
# Scaling is recommended for PCA: without it, genes with a large variance
# dominate the resulting principal components.
# center = TRUE shifts every variable to mean 0 (not median 0);
# scale. = TRUE rescales every variable to unit variance.
nc2 <- t(nc2) # Transpose the log2-CPM matrix: samples in rows, genes in columns
# Row : Samples
# Column : Variables such as genes
# prcomp(scale. = TRUE) stops with "cannot rescale a constant/zero column to
# unit variance" when a gene has the same value in every sample, so such
# genes are excluded from the fit (nc2 itself is left unfiltered).
gene_sd <- apply(nc2, 2, sd)
informative <- !is.na(gene_sd) & gene_sd > 0
pc <- prcomp(nc2[, informative, drop = FALSE], center = TRUE, scale. = TRUE)
print(pc) # Shows the loading of every gene on each principal component
# Rank genes by the absolute size of their PC1 loading (largest first).
pc1 <- sort(abs(pc$rotation[, 1]), decreasing = TRUE)
# head() avoids NA names when fewer than 100 genes are available.
print(names(head(pc1, 100))) # Top 100 contributing genes
# 3d pca plot
# Install rgl only when it is missing instead of reinstalling on every run.
if (!requireNamespace("rgl", quietly = TRUE)) install.packages("rgl")
library(rgl)
pc$x # Sample scores: coordinates of every sample on each principal component
# Group label per sample, in the row order of pc$x.
# NOTE(review): assumes 15 samples ordered as 5 groups (A-E) x 3 replicates;
# adjust to your own design.
group_labels <- rep(c("A", "B", "C", "D", "E"), each = 3)
group_colors <- c(A = "red", B = "blue", C = "darkgreen", D = "orange",
                  E = "purple")
cols <- unname(group_colors[group_labels])
# Fail loudly instead of silently recycling colours/labels onto wrong samples.
stopifnot(length(cols) == nrow(pc$x), ncol(pc$x) >= 3)
plot3d(x = pc$x[, 1], y = pc$x[, 2], z = pc$x[, 3], col = cols, size = 10,
       xlab = "PC1", ylab = "PC2", zlab = "PC3") # x=PC1, y=PC2, z=PC3
# Add labels for points. text3d() always draws into the current scene and its
# text size is set with `cex`; the former `add`/`size` arguments were not text
# settings and have been dropped.
text3d(x = pc$x[, 1], y = pc$x[, 2], z = pc$x[, 3], texts = group_labels)
# Rotate and see whether samples are separated well; the export below
# captures the view currently shown in the rgl window.
rgl.postscript("your_PCA_plot.pdf", "pdf") # Export what you see as PDF format
# Install pheatmap only when it is missing instead of on every run.
if (!requireNamespace("pheatmap", quietly = TRUE)) install.packages("pheatmap")
library(pheatmap)
# Names of the (up to) 100 genes with the largest absolute PC1 loading;
# head() avoids NA names when pc1 holds fewer than 100 genes.
top_genes <- names(head(pc1, 100))
# NOTE(review): `nc` is not defined in the visible code; presumably it is the
# genes-in-rows expression matrix before transposing - confirm.
# drop = FALSE keeps a matrix even if only one gene matches.
# heatmap for PC1 top 100 genes, each gene (row) scaled across samples
pheatmap(nc[rownames(nc) %in% top_genes, , drop = FALSE], scale = "row")
# PCA end
# How to project newly arrived data onto the existing PCA model.
# `newdata` must be a matrix or data frame with samples in rows and columns
# named like the genes used to fit `pc`; passing a character string (as the
# former placeholder did) makes predict() stop with an error.
# Placeholder: the first sample of nc2 stands in for real new data - replace it.
new_samples <- nc2[1, , drop = FALSE]
predict(pc, newdata = new_samples) # Scores of the new samples on each PC
http://planspace.org/2013/02/03/pca-3d-visualization-and-clustering-in-r/
'R관련' 카테고리의 다른 글
R 색깔 (0) | 2017.06.27 |
---|---|
Cancersubtype 패키지 (0) | 2017.06.09 |
Random forest (랜덤포레스트) feature selection (0) | 2017.05.16 |
R 버전 업데이트(update) (0) | 2017.04.28 |
3d plot 그리기 rgl package (0) | 2017.04.16 |