R 代码积累不定期更新
1.阶乘、递归、reduce、sprintf
#NO.1
# 阶乘函数
fact <- function(n){
if(n==0) return(1) #基例在这
else return(n*fact(n-1))
}
#1+2!+3!+...+20!的和
#测试 Reduce(‘+‘, lapply(1:3,fact)) 结果是9
Reduce(‘+‘, lapply(1:20,fact))
#NO.2
#判断101-200之间有多少个素数,并输出所有素数
is.prime <- function(num) {
if (num == 2) {
TRUE
} else if (any(num %% 2:(num-1) == 0)) {
FALSE
} else {
TRUE
}
}
#101共有多少个素数
Reduce(‘+‘, lapply(101:200,is.prime))
#打印出每一个素数
for(i in 101:200){
if(!is.prime(i)) next
print(sprintf(‘%d 是素数 %s‘,i,is.prime(i)))
}
2.MD5加密卡号
library(magrittr)
library(digest)
library(data.table)
library(readxl)
library(lubridate)
dat=read_excel(‘data_union.xlsx‘,sheet=4)
dim(dat)
get_len <- function(a){
return(paste0(nchar(a),a))
}
dat$card_len=lapply(dat$bank_card, get_len)%>%unlist()
dat$card_md5 <- lapply(dat$card_len, digest, algo="md5",serialize=F)%>%unlist()
dat$card_md5_upper <- toupper(dat$card_md5)
write.table(
dat[,c(‘card_md5_upper‘)]
,‘yqb_card_md5.txt‘,row.names = F
,col.names=F
,quote = F)
#ym_apply
dat$ym_apply=dat$time %m-% months(1)%>%
format("%Y%m")%>%as.numeric()
#write out card_md5 and ym_apply
write.table(
dat[c(‘card_md5_upper‘,‘ym_apply‘)]
,sep=‘,‘
,‘yqb_card_ym_md5.txt‘,row.names = F
,col.names=F
,quote = F)
unique(dat$ym_apply)
3.时间函数
https://cran.r-project.org/web/packages/lubridate/vignettes/lubridate.html
4.随机森林可视化
library(randomForest)
library(kernlab) # spam数据集 来源于这个 包
library(magrittr)
library(caret)
#setwd(‘...‘)
#data <- read.csv(‘...‘)
data(spam) #加载数据集
# spam 是一个数据集,一条记录代表一封邮件包含哪些特殊词
# ,这封邮件是否是垃圾邮件(y变量)等等一些变量
View(spam)
table(spam$type) #看一下Y变量的分布,是否存在不均衡(imbalanced data)
#type 是y变量 也可以是0,1,但注意,如果你的数据集是 df, 要预先 将y变量转换为factor ,df$y <- as.factor(df$y)
set.seed(2016) #为使模型可再现,这里预先设定随机种子,因为随机森林的 每棵树对行是进行随机有放回的抽样的
rf <- randomForest(type~.
,data=spam
,importance=TRUE #保留变量的重要性 TURE的时候,可通过rf$importance 查看
,proximity=FALSE #在n棵树种,一个矩阵任意两条记录 落在同一个叶子节点的概率,可以表示两条记录的相似程度
,ntree=500 #种多少棵树
,do.trace=20 #每20条显示 误分率
,na.action=na.omit#一行中只要存在一个NA,这条记录就删掉
,strata=spam$type #按照Y变量中(0,1的比例,进行上下采样,对占比少的用oversampling,对占比多的用downsampling)
,sampsize=c(1500,1500) #对0,1采样的个数分别是多少,都是有放回的
,mtry=7 #每一棵决策树,选取几个feature?,对于classification 一般是feature总数的开平方(这个是default)
,keep.forest=FALSE) #保留每一棵数
rf$confusion #混淆矩阵
##变量重要性,如果你的数据是有上千个变量,可以根据变量的重要性对数据进行降维
par(mfrow = c(2,2))
for(i in 1:4){
plot(sort(rf$importance[,i],decreasing =TRUE)%>%head(20)
,type=‘h‘
,main=paste0(colnames(rf$importance)[i])
,xlab=‘variable‘
,ylab=‘importance‘)
text(sort(rf$importance[,i],decreasing =TRUE)
,labels=names(sort(rf$importance[,i]))%>%head(20)
,pos=1
,cex=0.9)
}
## 下面画ROC 曲线,计算AUC
library(ROCR)
predictions=as.vector(rf$votes[,2])
pred=prediction(predictions,spam$type)
perf_AUC=performance(pred,"auc") #Calculate the AUC value
AUC=perf_AUC@y.values[[1]]
perf_ROC=performance(pred,"tpr","fpr") #plot the actual ROC curve
plot(perf_ROC, main="ROC plot")
text(0.5,0.5,paste("AUC = ",format(AUC, digits=5, scientific=FALSE)))
#cutoff accuracy
perf <- performance(pred, measure="acc", x.measure="cutoff")
# Get the cutoff for the best accuracy
bestAccInd <- which.max(perf@"y.values"[[1]])
bestMsg <- paste("best accuracy=", perf@"y.values"[[1]][bestAccInd],
" at cutoff=", round(perf@"x.values"[[1]][bestAccInd], 4))
plot(perf, sub=bestMsg)
abline(v=round(perf@"x.values"[[1]][bestAccInd], 4))
# calculate the confusion matrix and plot
cm <- confusionMatrix(rf$predicted, reference = spam$type)
draw_confusion_matrix(cm)
#confusion matrix visualization
draw_confusion_matrix <- function(cm) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt=‘n‘, yaxt=‘n‘)
title(‘CONFUSION MATRIX‘, cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col=‘#3F97D0‘)
text(195, 435, rf$classes[1], cex=1.2)
rect(250, 430, 340, 370, col=‘#F7AD50‘)
text(295, 435, rf$classes[2], cex=1.2)
text(125, 370, ‘Predicted‘, cex=1.3, srt=90, font=2)
text(245, 450, ‘Actual‘, cex=1.3, font=2)
rect(150, 305, 240, 365, col=‘#F7AD50‘)
rect(250, 305, 340, 365, col=‘#3F97D0‘)
text(140, 400, rf$classes[1], cex=1.2, srt=90)
text(140, 335, rf$classes[2], cex=1.2, srt=90)
# add in the cm results
res <- as.numeric(cm$table)
text(195, 400, res[1], cex=1.6, font=2, col=‘white‘)
text(195, 335, res[2], cex=1.6, font=2, col=‘white‘)
text(295, 400, res[3], cex=1.6, font=2, col=‘white‘)
text(295, 335, res[4], cex=1.6, font=2, col=‘white‘)
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt=‘n‘, yaxt=‘n‘)
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
原文:http://www.cnblogs.com/litao1105/p/6915068.html