import pandas as pd
titanic = pd.read_csv("titanic_train.csv")

# The Age column has missing values; fill every gap with the column median.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

# Encode Sex as integers (male -> 0, female -> 1).
# Series.map builds a numeric column in one pass. Writing ints into the
# string column through .loc leaves a mixed object column, and pandas
# versions that use a strict string dtype may reject such an assignment.
# NOTE: any label that is not a key of the mapping becomes NaN.
titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})

# Fill missing embarkation ports with "S", then encode S/C/Q as 0/1/2.
titanic["Embarked"] = titanic["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})
"""线性回归"""
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import numpy as np

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
alg = LinearRegression()

# Build the training / validation splits with KFold cross-validation.
#   n_splits:     the data is cut into n folds; each fold serves once as the
#                 validation set while the other n-1 folds are used for
#                 training, so the loop below runs n times.
#   shuffle:      bool, whether to shuffle the rows before splitting.
#   random_state: a fixed seed makes the split identical on every run
#                 (it only has an effect when shuffle is True).
kf = KFold(n_splits=3, shuffle=False, random_state=None)

predictions = []
for train_index, test_index in kf.split(titanic):
    # Fit on the rows of the training folds only.
    train_predictors = titanic[predictors].iloc[train_index, :]
    train_target = titanic["Survived"].iloc[train_index]
    alg.fit(train_predictors, train_target)
    # Predict the held-out fold and keep its predictions.
    test_prediction = alg.predict(titanic[predictors].iloc[test_index, :])
    predictions.append(test_prediction)

# Every row now has a held-out prediction, so the model's accuracy can be
# measured: the fraction of rows whose "Survived" label is predicted
# correctly.
# Without shuffling, KFold yields consecutive folds, so concatenating the
# per-fold arrays lines the predictions up with the rows of `titanic`.
predictions = np.concatenate(predictions, axis=0)
# Turn the regression output into class labels with a 0.5 threshold.
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Compare positionally against the true labels and average the matches.
accuracy = (predictions == titanic["Survived"].to_numpy()).mean()
print(accuracy)
# Logistic regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

alg2 = LogisticRegression(random_state=1)
# cross_val_score fits and scores the model once per fold (cv=3 -> 3 folds).
scores = cross_val_score(alg2, titanic[predictors], titanic["Survived"], cv=3)
# Report the mean of the per-fold scores.
print(scores.mean())
# Use an ensemble method, a random forest, to try to improve accuracy.
from sklearn.ensemble import RandomForestClassifier

# RandomForestClassifier parameters used here:
#   n_estimators:      number of trees in the forest.
#   min_samples_split: minimum number of samples a node must hold before it
#                      may be split.
#   min_samples_leaf:  minimum number of samples required at a leaf node.
alg3 = RandomForestClassifier(
    n_estimators=100, min_samples_split=4, min_samples_leaf=4, random_state=1
)
# random_state is deliberately not passed: it has no effect without
# shuffling, and current scikit-learn raises ValueError when it is set
# together with shuffle=False. The resulting splits are unchanged.
kf3 = KFold(n_splits=3, shuffle=False)
scores3 = cross_val_score(alg3, titanic[predictors], titanic["Survived"], cv=kf3)
print(scores3.mean())
# [Unrelated web-page residue] 【JavaDebug(二)】之Mysql语法异常java.sql.SQLSyntaxErrorException: You have an error in your SQL syntax; chec
# [Unrelated web-page residue] 12428