生信代码：机器学习-训练模型 - 开发者社区

生信代码：机器学习-训练模型

# caret supplies the data-splitting helpers; spam is loaded via data() after
# attaching kernlab.
library(caret)
library(kernlab)
data(spam)

# Seed the RNG so the random train/test split below is the same on every run,
# in line with the set.seed() calls used by the other examples in this file.
set.seed(32323)
# Put 75% of the rows into the training set; the rest become the test set.
inTrain <- createDataPartition(y = spam$type,
                               p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
dim(training)
[1] 3451   58

# K-fold split (k = 10). With returnTrain = TRUE each list element stores the
# row indices of the training portion of that fold.
set.seed(32323)
folds <- createFolds(spam$type, k = 10, list = TRUE, returnTrain = TRUE)
lengths(folds) # number of row indices stored for each fold
Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10 
  4140   4142   4141   4140   4141   4141   4142   4141   4141   4140
folds[["Fold01"]][1:10] # first 10 row indices of the first fold
[1]  1  2  4  5  6  7  8  9 10 11

# Same 10-fold split, but returnTrain = FALSE makes each list element store
# the held-out row indices instead of the training ones.
set.seed(32323)
folds <- createFolds(spam$type, k = 10, list = TRUE, returnTrain = FALSE)
vapply(folds, length, integer(1))
Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10 
   461    459    460    461    460    460    459    460    460    461
folds[["Fold01"]][1:10]
[1]  3 19 33 38 44 51 66 67 72 83

# Bootstrap resampling: 10 resamples of row indices. The repeated indices in
# the printed output below show that rows are drawn with replacement.
set.seed(32323)
folds <- createResample(spam$type, times = 10, list = TRUE)
lengths(folds)
Resample01 Resample02 Resample03 Resample04 Resample05 Resample06 Resample07 Resample08 
      4601       4601       4601       4601       4601       4601       4601       4601 
Resample09 Resample10 
      4601       4601
folds[["Resample01"]][1:10]
[1] 1 1 2 2 3 3 4 5 5 7

# Time-ordered splitting: each training window holds 20 consecutive points and
# the matching test window holds the 10 points that follow it.
set.seed(32323)
tme <- seq_len(1000) # stand-in for a time series of 1000 observations
folds <- createTimeSlices(tme, initialWindow = 20, horizon = 10)
names(folds)
[1] "train" "test"
folds[["train"]][[1]]
[1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
folds[["test"]][[1]]
[1] 21 22 23 24 25 26 27 28 29 30

library(caret)
library(kernlab)
data(spam)

# Both the train/test split and the resampling done inside train() draw random
# numbers; seed the RNG first so this example gives the same result every run.
set.seed(32323)
inTrain <- createDataPartition(y = spam$type,
                               p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
# Fit a logistic regression (method = "glm") of type on all other columns.
modelFit <- train(type ~ ., data = training, method = "glm")

# train.default is the default S3 method of caret's train() generic. Look it up
# through the S3 registry so the call works whether or not the method itself is
# exported from the package namespace (a bare `train.default` fails when it is
# registered but not exported).
args(getS3method("train", "default"))
function(x, y, method = "rf", preProcess = NULL, ..., weights = NULL,
         metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric == "RMSE", FALSE, TRUE),
         trControl = trainControl(), tuneGrid = NULL, tuneLength = 3)
NULL

# Show the formal arguments of trainControl() together with their defaults.
args(trainControl)
function (method = "boot", number = ifelse(grepl("cv", method), 
    10, 25), repeats = ifelse(grepl("[d_]cv$", method), 1, NA), 
    p = 0.75, search = "grid", initialWindow = NULL, horizon = 1, 
    fixedWindow = TRUE, skip = 0, verboseIter = FALSE, returnData = TRUE, 
    returnResamp = "final", savePredictions = FALSE, classProbs = FALSE, 
    summaryFunction = defaultSummary, selectionFunction = "best", 
    preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5, 
        freqCut = 95/5, uniqueCut = 10, cutoff = 0.9), sampling = NULL, 
    index = NULL, indexOut = NULL, indexFinal = NULL, timingSamps = 0, 
    predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5, 
        alpha = 0.05, method = "gls", complete = TRUE), trim = FALSE, 
    allowParallel = TRUE) 
NULL

# Refit the same glm model under a fixed seed so the bootstrap resampling
# results printed below can be reproduced.
set.seed(1235)
modelFit2 <- train(type ~ ., data = training, method = "glm")
modelFit2
Generalized Linear Model
3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam'
No pre-processing
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
Resampling results:
  Accuracy   Kappa    
  0.9156324  0.8229977

library(ISLR)
library(ggplot2)
library(caret)
# Load the Wage data set (presumably shipped with ISLR, attached above) and
# print a per-column summary of it.
data(Wage)
summary(Wage)
      year           age                     maritl           race     
 Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480  
 1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293  
 Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190  
 Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37  
 3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55                  
 Max.   :2009   Max.   :80.00                                          
              education                     region    
 1. < HS Grad      :268   2. Middle Atlantic   :3000  
 2. HS Grad        :971   1. New England       :   0  
 3. Some College   :650   3. East North Central:   0  
 4. College Grad   :685   4. West North Central:   0  
 5. Advanced Degree:426   5. South Atlantic    :   0  
                          6. East South Central:   0  
                          (Other)              :   0
           jobclass               health      health_ins  
 1. Industrial :1544   1. <=Good     : 858   1. Yes:2083  
 2. Information:1456   2. >=Very Good:2142   2. No : 917  
    logwage           wage       
 Min.   :3.000   Min.   : 20.09  
 1st Qu.:4.447   1st Qu.: 85.38  
 Median :4.653   Median :104.92  
 Mean   :4.654   Mean   :111.70  
 3rd Qu.:4.857   3rd Qu.:128.68  
 Max.   :5.763   Max.   :318.34

# Seed the RNG so the random 70/30 split, and every summary derived from it
# later on, can be reproduced.
set.seed(32323)
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]
testing <- Wage[-inTrain, ]
dim(training)
[1] 2102   11
dim(testing)
[1] 898  11

# Pairs plot of the outcome (wage) against three candidate predictors.
featurePlot(
  x = training[c("age", "education", "jobclass")],
  y = training[["wage"]],
  plot = "pairs"
)

# qplot() is deprecated in current ggplot2 releases; the plots are built with
# ggplot() + geom_point(), which draws the same scatter plots.

# Wage against age.
ggplot(training, aes(x = age, y = wage)) +
  geom_point()

# Same scatter plot, coloured by job class.
ggplot(training, aes(x = age, y = wage, colour = jobclass)) +
  geom_point()

# Coloured by education, with one linear fit per education level on top.
qq <- ggplot(training, aes(x = age, y = wage, colour = education)) +
  geom_point()
qq + geom_smooth(method = "lm", formula = y ~ x)

library(Hmisc)
# Bin wage into three quantile groups (g = 3) and count the rows per bin.
cutWage <- cut2(training[["wage"]], g = 3)
table(cutWage)
cutWage
[ 20.1, 91.7) [ 91.7,118.9) [118.9,318.3] 
          702           722           678

# qplot() is deprecated in current ggplot2 releases, so the plots are built
# with ggplot(). cutWage is not a column of `training`; aes() resolves it from
# the calling environment, exactly as qplot() did.

# Box plot of age within each wage bin.
p1 <- ggplot(training, aes(x = cutWage, y = age, fill = cutWage)) +
  geom_boxplot()
p1

# Same box plot with the individual observations jittered on top.
p2 <- p1 + geom_jitter()
library(gridExtra)
grid.arrange(p1, p2, ncol = 2)

# Cross-tabulate wage bin against job class. The assignment alone prints
# nothing, so the table is printed explicitly to produce the output below.
t1 <- table(cutWage, training$jobclass)
t1
cutWage         1. Industrial 2. Information
  [ 20.1, 91.7)           459            243
  [ 91.7,118.9)           387            335
  [118.9,318.3]           276            402
# Row-wise proportions (margin = 1): share of each job class within a wage bin.
prop.table(t1, 1)
cutWage         1. Industrial 2. Information
  [ 20.1, 91.7)     0.6538462      0.3461538
  [ 91.7,118.9)     0.5360111      0.4639889
  [118.9,318.3]     0.4070796      0.5929204

# Density of wage per education level. qplot() is deprecated in current
# ggplot2 releases, so the plot is built with ggplot() + geom_density().
ggplot(training, aes(x = wage, colour = education)) +
  geom_density()

library(caret)
library(kernlab)
data(spam)

# Seed the RNG so the random split, and the summary statistics computed from
# it below, can be reproduced.
set.seed(32323)
inTrain <- createDataPartition(y = spam$type,
                               p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
# Distribution of capitalAve in the training set.
hist(training$capitalAve, main = "", xlab = "ave. capital run length")

# Location and spread of capitalAve on its raw scale.
mean(training[["capitalAve"]])
[1] 5.207716
sd(training[["capitalAve"]])
[1] 30.09083

# Standardise the training values with the training mean and sd.
trainCapAve <- training[["capitalAve"]]
capAveCentre <- mean(trainCapAve)
capAveSpread <- sd(trainCapAve)
trainCapAveS <- (trainCapAve - capAveCentre) / capAveSpread
mean(trainCapAveS)
[1] 5.682636e-19
sd(trainCapAveS)
[1] 1

# The test values are standardised with the *training* mean and sd, so the
# result is only approximately centred at 0 with unit spread.
testCapAve <- testing[["capitalAve"]]
testCapAveS <- (testCapAve - capAveCentre) / capAveSpread
mean(testCapAveS)
[1] -0.002154109
sd(testCapAveS)
[1] 1.203646

# Estimate centring/scaling parameters from the training predictors (every
# column except the outcome `type`) and apply them to the training set.
preObj <- preProcess(training[, names(training) != "type"],
                     method = c("center", "scale"))
trainCapAveS <- predict(preObj, training[, names(training) != "type"])[["capitalAve"]]
mean(trainCapAveS)
[1] 5.682636e-19
sd(trainCapAveS)
[1] 1

# Apply the same preObj, fitted on the training set above, to the test set.
testCapAveS <- predict(preObj, testing[, names(testing) != "type"])[["capitalAve"]]
mean(testCapAveS)
[1] -0.002154109
sd(testCapAveS)
[1] 1.203646

# Pass the pre-processing steps to train() itself via its preProcess argument
# instead of transforming the data by hand beforehand.
set.seed(32343)
modelFit <- train(
  type ~ .,
  data = training,
  method = "glm",
  preProcess = c("center", "scale")
)
modelFit
Generalized Linear Model
3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam'
Pre-processing: centered (57), scaled (57) 
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
Resampling results:
  Accuracy  Kappa    
  0.91793   0.8272674

# Box-Cox transform the predictors and inspect the transformed capitalAve.
preObj <- preProcess(training[,-58], method = c("BoxCox"))
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
# par() returns the previous settings; keep them so the 1 x 2 layout can be
# undone afterwards instead of leaking into every later plot.
oldPar <- par(mfrow = c(1, 2))
hist(trainCapAveS)
qqnorm(trainCapAveS)
par(oldPar)

set.seed(13343)
# Copy capitalAve and blank out a random ~5% of the copy, so that the imputed
# values can be compared with the known originals.
training$capAve <- training$capitalAve
selectNA <- rbinom(nrow(training), size = 1, prob = 0.05) == 1
training$capAve[selectNA] <- NA
# Fill the missing values with k-nearest-neighbour imputation. RANN is
# installed only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("RANN", quietly = TRUE)) {
  install.packages("RANN")
}
library(RANN)
preObj <- preProcess(training[,-58], method = "knnImpute")
capAve <- predict(preObj, training[,-58])$capAve
# Standardise the original values so they can be compared with the imputed
# column (per the caret documentation, knnImpute centres and scales the data).
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth)) / sd(capAveTruth)

quantile(capAve - capAveTruth) # quantiles of imputed minus original values
           0%           25%           50%           75%          100% 
-11.840492704  -0.013557568  -0.006870429   0.007224872   6.258135049

生信代码：机器学习-训练模型

生信代码：机器学习-训练模型

数据分割

训练

绘制预测变量

注意：

数据预处理