Python Tutorial (38): Machine Learning Basics
ztj100 · 2025-08-07
Today's Goals
- Understand the basic concepts and types of machine learning
- Get comfortable with the scikit-learn library
- Learn data preprocessing and feature engineering
- Understand supervised and unsupervised learning
- Learn model evaluation and selection
Machine Learning Overview
Machine learning is a branch of artificial intelligence in which computers learn patterns from data. It comes in three broad flavors (contrasted in the sketch below):
- Supervised learning: labeled data; predict a target variable
- Unsupervised learning: unlabeled data; discover structure in the data
- Reinforcement learning: learn an optimal policy by interacting with an environment
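The split is easiest to see in code. Below is a minimal sketch (the dataset and algorithm choices are illustrative, not prescriptive) showing that scikit-learn exposes both paradigms through the same fit/predict API:

# Minimal sketch: the same estimator API covers both paradigms
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

X, y = load_iris(return_X_y=True)

clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)                    # supervised: features AND labels
print(clf.predict(X[:3]))        # predicts class labels

km = KMeans(n_clusters=3, n_init=10, random_state=42)
km.fit(X)                        # unsupervised: features only
print(km.labels_[:3])            # discovered cluster assignments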
Machine Learning Workflow
# The machine learning workflow
def ml_workflow():
    """
    1. Problem definition  - clarify the prediction target
    2. Data collection     - obtain training data
    3. Data preprocessing  - clean and prepare the data
    4. Feature engineering - create and select features
    5. Model selection     - choose a suitable algorithm
    6. Model training      - fit the model parameters
    7. Model evaluation    - validate model performance
    8. Model deployment    - apply the model to make predictions
    """
    pass
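To make the eight steps concrete, here is a compact sketch that walks the whole loop once on the iris dataset. The split ratio, scaler, model, and metric are illustrative assumptions, not the only options:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)                      # 2. collect data
X_train, X_test, y_train, y_test = train_test_split(   # 3. prepare data
    X, y, test_size=0.3, random_state=42, stratify=y
)
scaler = StandardScaler().fit(X_train)                 # 4. feature engineering
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)
model = LogisticRegression(max_iter=1000)              # 5. choose a model
model.fit(X_train_s, y_train)                          # 6. train
print(f"accuracy: {accuracy_score(y_test, model.predict(X_test_s)):.3f}")  # 7. evaluate
# 8. deploy: persist with joblib.dump(model, 'model.joblib') and reuse later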
scikit-learn Basics
1. Installation and Imports
pip install scikit-learn matplotlib seaborn
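The examples in this tutorial assume a reasonably recent scikit-learn (1.2 or later, where load_boston has been removed and OneHotEncoder takes sparse_output); a quick sanity check:

import sklearn
print(sklearn.__version__)  # the examples below assume scikit-learn >= 1.2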
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets, preprocessing, model_selection, metrics
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Configure a CJK-capable font (only needed if you keep Chinese plot labels)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
2. Loading Datasets
def load_datasets():
    """Load scikit-learn's built-in datasets"""
    # 1. Iris dataset (classification)
    iris = datasets.load_iris()
    print("Iris dataset:")
    print(f"  Number of features: {iris.data.shape[1]}")
    print(f"  Number of samples: {iris.data.shape[0]}")
    print(f"  Target classes: {iris.target_names}")
    print(f"  Feature names: {iris.feature_names}")
    # 2. California housing dataset (regression)
    # Note: load_boston() was removed in scikit-learn 1.2, so we use the
    # California housing data as the regression example instead
    # (it is downloaded on the first call)
    housing = datasets.fetch_california_housing()
    print("\nCalifornia housing dataset:")
    print(f"  Number of features: {housing.data.shape[1]}")
    print(f"  Number of samples: {housing.data.shape[0]}")
    print(f"  Feature names: {housing.feature_names}")
    # 3. Handwritten digits dataset (classification)
    digits = datasets.load_digits()
    print("\nHandwritten digits dataset:")
    print(f"  Number of features: {digits.data.shape[1]}")
    print(f"  Number of samples: {digits.data.shape[0]}")
    print(f"  Target classes: {digits.target_names}")
    # 4. Breast cancer dataset (classification)
    cancer = datasets.load_breast_cancer()
    print("\nBreast cancer dataset:")
    print(f"  Number of features: {cancer.data.shape[1]}")
    print(f"  Number of samples: {cancer.data.shape[0]}")
    print(f"  Target classes: {cancer.target_names}")
    return iris, housing, digits, cancer
# Load the datasets
iris, housing, digits, cancer = load_datasets()
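A convenient variant, sketched here in case you prefer pandas: most loaders accept as_frame=True and then expose the data as a DataFrame via the .frame attribute.

# Optional: load the iris data as a pandas DataFrame
iris_df = datasets.load_iris(as_frame=True)
print(iris_df.frame.head())      # feature columns plus a 'target' column
print(iris_df.frame.describe())  # quick summary statistics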
Data Preprocessing
1. Standardization and Normalization
def data_preprocessing():
    """Data preprocessing examples"""
    # Use the iris dataset
    X = iris.data
    y = iris.target
    print(f"Original data shape: {X.shape}")
    print(f"First 5 rows of original data:\n{X[:5]}")
    # 1. Standardization (StandardScaler): zero mean, unit variance
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    print(f"\nFirst 5 rows after standardization:\n{X_scaled[:5]}")
    print(f"Mean after standardization: {X_scaled.mean(axis=0)}")
    print(f"Std after standardization: {X_scaled.std(axis=0)}")
    # 2. Normalization (MinMaxScaler): rescale to [0, 1]
    from sklearn.preprocessing import MinMaxScaler
    minmax_scaler = MinMaxScaler()
    X_normalized = minmax_scaler.fit_transform(X)
    print(f"\nFirst 5 rows after normalization:\n{X_normalized[:5]}")
    print(f"Min after normalization: {X_normalized.min(axis=0)}")
    print(f"Max after normalization: {X_normalized.max(axis=0)}")
    # 3. Robust scaling (RobustScaler): median/IQR, less sensitive to outliers
    from sklearn.preprocessing import RobustScaler
    robust_scaler = RobustScaler()
    X_robust = robust_scaler.fit_transform(X)
    print(f"\nFirst 5 rows after robust scaling:\n{X_robust[:5]}")
    # 4. Label encoding
    from sklearn.preprocessing import LabelEncoder
    # Create a toy categorical feature
    categories = ['red', 'blue', 'green', 'red', 'blue']
    le = LabelEncoder()
    categories_encoded = le.fit_transform(categories)
    print(f"\nOriginal categories: {categories}")
    print(f"Encoded: {categories_encoded}")
    print(f"Class mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")
    # 5. One-hot encoding
    from sklearn.preprocessing import OneHotEncoder
    categories_2d = np.array(categories).reshape(-1, 1)
    # Note: the 'sparse' argument was renamed to 'sparse_output' in scikit-learn 1.2
    ohe = OneHotEncoder(sparse_output=False)
    categories_onehot = ohe.fit_transform(categories_2d)
    print(f"\nOne-hot encoded:\n{categories_onehot}")
    print(f"Feature names: {ohe.get_feature_names_out()}")
    return {
        'X_scaled': X_scaled,
        'X_normalized': X_normalized,
        'X_robust': X_robust,
        'categories_encoded': categories_encoded,
        'categories_onehot': categories_onehot
    }
# Run the preprocessing examples
preprocessed_data = data_preprocessing()
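One caveat: above, the scalers are fit on the full dataset purely for illustration. In a real project you should fit them on the training split only and reuse the learned statistics on the test split, or test-set information leaks into training. A minimal sketch:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # statistics from training data only
X_test_scaled = scaler.transform(X_test)        # reuse the same mean/std on the test set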
2. Feature Selection and Dimensionality Reduction
def feature_selection_and_reduction():
    """Feature selection and dimensionality reduction examples"""
    # Use the breast cancer dataset (relatively high-dimensional)
    X = cancer.data
    y = cancer.target
    print(f"Original data shape: {X.shape}")
    # 1. Variance thresholding
    from sklearn.feature_selection import VarianceThreshold
    selector = VarianceThreshold(threshold=0.01)
    X_var_selected = selector.fit_transform(X)
    print(f"\nShape after variance thresholding: {X_var_selected.shape}")
    print(f"Number of features kept: {X_var_selected.shape[1]}")
    # 2. Univariate feature selection
    from sklearn.feature_selection import SelectKBest, f_classif
    selector_kbest = SelectKBest(score_func=f_classif, k=10)
    X_kbest = selector_kbest.fit_transform(X, y)
    print(f"\nShape after SelectKBest: {X_kbest.shape}")
    print(f"Feature scores: {selector_kbest.scores_}")
    # 3. Recursive feature elimination
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    estimator = LogisticRegression(max_iter=1000)
    selector_rfe = RFE(estimator=estimator, n_features_to_select=10)
    X_rfe = selector_rfe.fit_transform(X, y)
    print(f"\nShape after RFE: {X_rfe.shape}")
    print(f"Selected features: {selector_rfe.support_}")
    # 4. Principal component analysis (PCA)
    pca = PCA(n_components=0.95)  # keep 95% of the variance
    X_pca = pca.fit_transform(X)
    print(f"\nShape after PCA: {X_pca.shape}")
    print(f"Explained variance ratios: {pca.explained_variance_ratio_}")
    print(f"Cumulative explained variance: {np.sum(pca.explained_variance_ratio_):.3f}")
    # 5. Linear discriminant analysis (LDA)
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    lda = LinearDiscriminantAnalysis()
    # With 2 classes, LDA yields at most one discriminant component
    X_lda = lda.fit_transform(X, y)
    print(f"\nShape after LDA: {X_lda.shape}")
    print(f"Explained variance ratios: {lda.explained_variance_ratio_}")
    # 6. Visualize the reduced data
    plt.figure(figsize=(15, 5))
    # Original data (first two features)
    plt.subplot(1, 3, 1)
    plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.6)
    plt.title('Original data (first two features)')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    # PCA projection
    plt.subplot(1, 3, 2)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, alpha=0.6)
    plt.title('PCA')
    plt.xlabel('Principal component 1')
    plt.ylabel('Principal component 2')
    # LDA projection (a single component for this binary problem, spread along x)
    plt.subplot(1, 3, 3)
    plt.scatter(X_lda[:, 0], np.zeros_like(X_lda[:, 0]), c=y, alpha=0.6)
    plt.title('LDA')
    plt.xlabel('Discriminant component 1')
    plt.tight_layout()
    plt.show()
    return {
        'X_var_selected': X_var_selected,
        'X_kbest': X_kbest,
        'X_rfe': X_rfe,
        'X_pca': X_pca,
        'X_lda': X_lda
    }
# Run feature selection and dimensionality reduction
reduced_data = feature_selection_and_reduction()
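In practice, selection and scaling are usually chained with the model in a Pipeline, so that each cross-validation fold re-fits them on its own training portion. A sketch with illustrative choices (k=10 features, logistic regression):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

pipe = Pipeline([
    ('scale', StandardScaler()),
    ('select', SelectKBest(score_func=f_classif, k=10)),
    ('clf', LogisticRegression(max_iter=1000)),
])
scores = cross_val_score(pipe, cancer.data, cancer.target, cv=5)
print(f"pipeline CV accuracy: {scores.mean():.3f} ± {scores.std():.3f}")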
Supervised Learning
1. Classification
def classification_examples():
    """Classification examples"""
    # Use the iris dataset
    X = iris.data
    y = iris.target
    # Split the data
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")
    # 1. Logistic regression
    lr = LogisticRegression(random_state=42, max_iter=1000)
    lr.fit(X_train, y_train)
    lr_pred = lr.predict(X_test)
    lr_score = lr.score(X_test, y_test)
    print(f"\nLogistic regression accuracy: {lr_score:.3f}")
    # 2. Decision tree
    dt = DecisionTreeClassifier(random_state=42, max_depth=3)
    dt.fit(X_train, y_train)
    dt_pred = dt.predict(X_test)
    dt_score = dt.score(X_test, y_test)
    print(f"Decision tree accuracy: {dt_score:.3f}")
    # 3. Random forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    rf_score = rf.score(X_test, y_test)
    print(f"Random forest accuracy: {rf_score:.3f}")
    # 4. Support vector machine
    svm = SVC(kernel='rbf', random_state=42)
    svm.fit(X_train, y_train)
    svm_pred = svm.predict(X_test)
    svm_score = svm.score(X_test, y_test)
    print(f"SVM accuracy: {svm_score:.3f}")
    # 5. Model comparison
    models = {
        'Logistic Regression': lr,
        'Decision Tree': dt,
        'Random Forest': rf,
        'SVM': svm
    }
    results = {}
    for name, model in models.items():
        # Cross-validation on the full dataset
        cv_scores = model_selection.cross_val_score(
            model, X, y, cv=5, scoring='accuracy'
        )
        results[name] = {
            'test_score': model.score(X_test, y_test),
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std()
        }
    print("\nModel comparison:")
    for name, result in results.items():
        print(f"{name}: test accuracy={result['test_score']:.3f}, "
              f"cross-validation={result['cv_mean']:.3f}±{result['cv_std']:.3f}")
    # 6. Confusion matrices
    from sklearn.metrics import confusion_matrix, classification_report
    plt.figure(figsize=(12, 8))
    for i, (name, model) in enumerate(models.items(), 1):
        plt.subplot(2, 2, i)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'{name} - confusion matrix')
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
    plt.tight_layout()
    plt.show()
    # 7. Classification report
    print("\nRandom forest classification report:")
    print(classification_report(y_test, rf_pred, target_names=iris.target_names))
    return models, results
# Run the classification examples
classification_models, classification_results = classification_examples()
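Once fitted, any of these classifiers can score new, unseen measurements. A small sketch using the random forest from above (the flower measurements are made up for illustration):

import numpy as np

# A hypothetical new flower: sepal length/width, petal length/width in cm
new_flower = np.array([[5.1, 3.5, 1.4, 0.2]])
rf_model = classification_models['Random Forest']
print(iris.target_names[rf_model.predict(new_flower)[0]])  # predicted species
print(rf_model.predict_proba(new_flower))                  # per-class probabilities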
2. Regression
def regression_examples():
    """Regression examples"""
    # Use the California housing dataset
    X = housing.data
    y = housing.target
    # Split the data
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")
    # 1. Linear regression
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    lr_pred = lr.predict(X_test)
    lr_score = lr.score(X_test, y_test)
    print(f"\nLinear regression R^2: {lr_score:.3f}")
    # 2. Ridge regression
    from sklearn.linear_model import Ridge
    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train, y_train)
    ridge_pred = ridge.predict(X_test)
    ridge_score = ridge.score(X_test, y_test)
    print(f"Ridge regression R^2: {ridge_score:.3f}")
    # 3. Lasso regression
    from sklearn.linear_model import Lasso
    lasso = Lasso(alpha=0.1)
    lasso.fit(X_train, y_train)
    lasso_pred = lasso.predict(X_test)
    lasso_score = lasso.score(X_test, y_test)
    print(f"Lasso regression R^2: {lasso_score:.3f}")
    # 4. Random forest regression
    from sklearn.ensemble import RandomForestRegressor
    rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_reg.fit(X_train, y_train)
    rf_pred = rf_reg.predict(X_test)
    rf_score = rf_reg.score(X_test, y_test)
    print(f"Random forest regression R^2: {rf_score:.3f}")
    # 5. Support vector regression (can be slow on ~14k training samples)
    from sklearn.svm import SVR
    svr = SVR(kernel='rbf')
    svr.fit(X_train, y_train)
    svr_pred = svr.predict(X_test)
    svr_score = svr.score(X_test, y_test)
    print(f"Support vector regression R^2: {svr_score:.3f}")
    # 6. Model comparison
    from sklearn.metrics import mean_squared_error, mean_absolute_error
    models = {
        'Linear Regression': lr,
        'Ridge': ridge,
        'Lasso': lasso,
        'Random Forest': rf_reg,
        'SVR': svr
    }
    results = {}
    for name, model in models.items():
        y_pred = model.predict(X_test)
        results[name] = {
            'R^2': model.score(X_test, y_test),
            'MSE': mean_squared_error(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred)
        }
    print("\nRegression model comparison:")
    for name, result in results.items():
        print(f"{name}: R^2={result['R^2']:.3f}, "
              f"MSE={result['MSE']:.3f}, MAE={result['MAE']:.3f}")
    # 7. Visualize predictions against true values
    plt.figure(figsize=(15, 10))
    for i, (name, model) in enumerate(models.items(), 1):
        plt.subplot(2, 3, i)
        y_pred = model.predict(X_test)
        plt.scatter(y_test, y_pred, alpha=0.6)
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
        plt.xlabel('True value')
        plt.ylabel('Predicted value')
        plt.title(f'{name}\nR^2 = {model.score(X_test, y_test):.3f}')
    plt.tight_layout()
    plt.show()
    # 8. Feature importances (random forest)
    feature_importance = pd.DataFrame({
        'feature': housing.feature_names,
        'importance': rf_reg.feature_importances_
    }).sort_values('importance', ascending=False)
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['feature'], feature_importance['importance'])
    plt.xlabel('Feature importance')
    plt.title('Random forest feature importances')
    plt.gca().invert_yaxis()
    plt.show()
    return models, results
# Run the regression examples
regression_models, regression_results = regression_examples()
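A note on the SVR result: kernel methods are sensitive to feature scales, so an unscaled SVR often underperforms here. A common remedy, sketched below with default (untuned) hyperparameters, is to wrap it with a scaler:

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, test_size=0.3, random_state=42
)
# Scaling happens inside the pipeline, fit on the training data only
scaled_svr = make_pipeline(StandardScaler(), SVR(kernel='rbf'))
scaled_svr.fit(X_train, y_train)  # note: SVR is slow on ~14k samples
print(f"Scaled SVR R^2: {scaled_svr.score(X_test, y_test):.3f}")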
Unsupervised Learning
1. Clustering
def clustering_examples():
    """Clustering examples"""
    # Use the iris dataset (ignoring the labels during clustering)
    X = iris.data
    y_true = iris.target
    print(f"Data shape: {X.shape}")
    # 1. K-means clustering
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    kmeans_labels = kmeans.fit_predict(X)
    print("\nK-means results:")
    print(f"Cluster centers shape: {kmeans.cluster_centers_.shape}")
    print(f"Cluster labels: {np.unique(kmeans_labels)}")
    # 2. Hierarchical (agglomerative) clustering
    from sklearn.cluster import AgglomerativeClustering
    hierarchical = AgglomerativeClustering(n_clusters=3)
    hierarchical_labels = hierarchical.fit_predict(X)
    print("\nHierarchical clustering results:")
    print(f"Cluster labels: {np.unique(hierarchical_labels)}")
    # 3. DBSCAN clustering
    from sklearn.cluster import DBSCAN
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    dbscan_labels = dbscan.fit_predict(X)
    print("\nDBSCAN results:")
    print(f"Cluster labels: {np.unique(dbscan_labels)}")
    print(f"Number of noise points: {np.sum(dbscan_labels == -1)}")
    # 4. Clustering evaluation
    from sklearn.metrics import silhouette_score, adjusted_rand_score
    clustering_methods = {
        'K-means': kmeans_labels,
        'Hierarchical': hierarchical_labels,
        'DBSCAN': dbscan_labels
    }
    print("\nClustering evaluation:")
    for name, labels in clustering_methods.items():
        if len(np.unique(labels)) > 1:  # need at least two clusters
            silhouette = silhouette_score(X, labels)
            ari = adjusted_rand_score(y_true, labels)
            print(f"{name}: silhouette={silhouette:.3f}, adjusted Rand index={ari:.3f}")
    # 5. Visualize the clustering results
    plt.figure(figsize=(15, 10))
    # True labels
    plt.subplot(2, 3, 1)
    plt.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis')
    plt.title('True labels')
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    # K-means
    plt.subplot(2, 3, 2)
    plt.scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis')
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                c='red', marker='x', s=200, linewidths=3)
    plt.title('K-means')
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    # Hierarchical clustering
    plt.subplot(2, 3, 3)
    plt.scatter(X[:, 0], X[:, 1], c=hierarchical_labels, cmap='viridis')
    plt.title('Hierarchical clustering')
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    # DBSCAN
    plt.subplot(2, 3, 4)
    plt.scatter(X[:, 0], X[:, 1], c=dbscan_labels, cmap='viridis')
    plt.title('DBSCAN')
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    # Elbow method (choosing K)
    plt.subplot(2, 3, 5)
    inertias = []
    K_range = range(1, 11)
    for k in K_range:
        kmeans_temp = KMeans(n_clusters=k, n_init=10, random_state=42)
        kmeans_temp.fit(X)
        inertias.append(kmeans_temp.inertia_)
    plt.plot(K_range, inertias, 'bo-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Inertia')
    plt.title('Elbow method')
    # Silhouette scores
    plt.subplot(2, 3, 6)
    silhouette_scores = []
    for k in K_range[1:]:  # start from K=2
        kmeans_temp = KMeans(n_clusters=k, n_init=10, random_state=42)
        labels = kmeans_temp.fit_predict(X)
        score = silhouette_score(X, labels)
        silhouette_scores.append(score)
    plt.plot(K_range[1:], silhouette_scores, 'ro-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Silhouette score')
    plt.title('Silhouette scores')
    plt.tight_layout()
    plt.show()
    return clustering_methods
# Run the clustering examples
clustering_results = clustering_examples()
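All three algorithms above are distance-based, so feature scales influence the clusters. Whether scaling actually improves the result depends on the data, but standardizing first is a common variant worth trying; a sketch:

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Standardize, then cluster, then compare against the known species labels
X_std = StandardScaler().fit_transform(iris.data)
labels_std = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(X_std)
print(f"ARI on standardized features: {adjusted_rand_score(iris.target, labels_std):.3f}")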
2. Dimensionality Reduction and Visualization
def dimensionality_reduction():
    """Dimensionality reduction and visualization examples"""
    # Use the handwritten digits dataset
    X = digits.data
    y = digits.target
    print(f"Original data shape: {X.shape}")
    # 1. PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    print(f"Shape after PCA: {X_pca.shape}")
    print(f"Explained variance ratios: {pca.explained_variance_ratio_}")
    # 2. t-SNE
    from sklearn.manifold import TSNE
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X)
    print(f"Shape after t-SNE: {X_tsne.shape}")
    # 3. UMAP (optional third-party library: pip install umap-learn)
    try:
        import umap
        umap_reducer = umap.UMAP(random_state=42)
        X_umap = umap_reducer.fit_transform(X)
        print(f"Shape after UMAP: {X_umap.shape}")
    except ImportError:
        print("UMAP is not installed; skipping UMAP")
        X_umap = None
    # 4. Visualize the embeddings
    plt.figure(figsize=(15, 5))
    # PCA
    plt.subplot(1, 3, 1)
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab10', alpha=0.6)
    plt.title('PCA')
    plt.xlabel('Principal component 1')
    plt.ylabel('Principal component 2')
    plt.colorbar(scatter)
    # t-SNE
    plt.subplot(1, 3, 2)
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='tab10', alpha=0.6)
    plt.title('t-SNE')
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.colorbar(scatter)
    # UMAP
    if X_umap is not None:
        plt.subplot(1, 3, 3)
        scatter = plt.scatter(X_umap[:, 0], X_umap[:, 1], c=y, cmap='tab10', alpha=0.6)
        plt.title('UMAP')
        plt.xlabel('UMAP 1')
        plt.ylabel('UMAP 2')
        plt.colorbar(scatter)
    plt.tight_layout()
    plt.show()
    # 5. PCA reconstruction of an image
    plt.figure(figsize=(12, 4))
    # Original image
    plt.subplot(1, 3, 1)
    plt.imshow(X[0].reshape(8, 8), cmap='gray')
    plt.title('Original digit')
    plt.axis('off')
    # PCA reconstruction
    pca_full = PCA(n_components=0.95)
    X_pca_full = pca_full.fit_transform(X)
    X_reconstructed = pca_full.inverse_transform(X_pca_full)
    plt.subplot(1, 3, 2)
    plt.imshow(X_reconstructed[0].reshape(8, 8), cmap='gray')
    plt.title(f'PCA reconstruction (95% variance)\nusing {pca_full.n_components_} components')
    plt.axis('off')
    # Compression ratio
    compression_ratio = pca_full.n_components_ / X.shape[1]
    plt.subplot(1, 3, 3)
    plt.imshow(X_reconstructed[0].reshape(8, 8), cmap='gray')
    plt.title(f'Compression ratio: {compression_ratio:.2%}')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
    return {
        'X_pca': X_pca,
        'X_tsne': X_tsne,
        'X_umap': X_umap,
        'X_reconstructed': X_reconstructed
    }
# Run the dimensionality reduction examples
reduction_results = dimensionality_reduction()
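To choose the number of PCA components, it helps to plot the cumulative explained variance; a short sketch:

# Fit PCA with all 64 components and inspect the cumulative variance curve
pca_all = PCA().fit(digits.data)
cumulative = np.cumsum(pca_all.explained_variance_ratio_)
plt.plot(range(1, len(cumulative) + 1), cumulative, 'o-')
plt.axhline(0.95, color='r', linestyle='--', label='95% variance')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.legend()
plt.show()
print(f"Components needed for 95%: {np.argmax(cumulative >= 0.95) + 1}")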
Model Evaluation and Selection
1. Cross-Validation
def model_evaluation():
    """Model evaluation examples"""
    # Use the iris dataset
    X = iris.data
    y = iris.target
    # Define several models
    models = {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM': SVC(random_state=42)
    }
    # 1. K-fold cross-validation
    # (for classifiers, an integer cv already uses stratified folds by default)
    print("K-fold cross-validation results:")
    for name, model in models.items():
        cv_scores = model_selection.cross_val_score(
            model, X, y, cv=5, scoring='accuracy'
        )
        print(f"{name}: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
    # 2. Explicit stratified K-fold cross-validation
    # (note: passing groups=y here would be wrong; groups is for GroupKFold,
    # stratification is requested via a StratifiedKFold splitter)
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    print("\nStratified K-fold cross-validation results:")
    for name, model in models.items():
        cv_scores = model_selection.cross_val_score(
            model, X, y, cv=skf, scoring='accuracy'
        )
        print(f"{name}: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
    # 3. Learning curves
    from sklearn.model_selection import learning_curve
    plt.figure(figsize=(15, 10))
    for i, (name, model) in enumerate(models.items(), 1):
        plt.subplot(2, 2, i)
        train_sizes, train_scores, val_scores = learning_curve(
            model, X, y, cv=5, n_jobs=-1,
            train_sizes=np.linspace(0.1, 1.0, 10),
            scoring='accuracy'
        )
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        val_mean = np.mean(val_scores, axis=1)
        val_std = np.std(val_scores, axis=1)
        plt.plot(train_sizes, train_mean, 'o-', color='r', label='Training')
        plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='r')
        plt.plot(train_sizes, val_mean, 'o-', color='g', label='Validation')
        plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='g')
        plt.xlabel('Number of training samples')
        plt.ylabel('Accuracy')
        plt.title(f'{name} - learning curve')
        plt.legend(loc='best')
        plt.grid(True)
    plt.tight_layout()
    plt.show()
    # 4. Grid search for hyperparameter tuning
    from sklearn.model_selection import GridSearchCV
    # Parameter grid for the random forest
    param_grid_rf = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }
    grid_search_rf = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid_rf,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search_rf.fit(X, y)
    print(f"\nBest random forest parameters: {grid_search_rf.best_params_}")
    print(f"Best cross-validation score: {grid_search_rf.best_score_:.3f}")
    # 5. Randomized search
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint
    param_distributions = {
        'n_estimators': randint(50, 300),
        'max_depth': [None] + list(range(5, 25)),
        'min_samples_split': randint(2, 15),
        'min_samples_leaf': randint(1, 10)
    }
    random_search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions,
        n_iter=100,
        cv=5,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X, y)
    print(f"\nBest randomized-search parameters: {random_search.best_params_}")
    print(f"Best cross-validation score: {random_search.best_score_:.3f}")
    return {
        'grid_search_rf': grid_search_rf,
        'random_search': random_search
    }
# Run the model evaluation examples
evaluation_results = model_evaluation()
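One caveat: both searches above tune and report scores on the same data, so best_score_ is an optimistic estimate. A cleaner sketch keeps a final hold-out set that the search never sees (the smaller grid here is just to keep it quick):

from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42, stratify=iris.target
)
search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    {'n_estimators': [50, 100, 200], 'max_depth': [None, 10]},
    cv=5, n_jobs=-1
)
search.fit(X_train, y_train)                      # tuned on training data only
print(f"Hold-out accuracy: {search.score(X_test, y_test):.3f}")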
Today's Summary
Today we covered the fundamentals of machine learning:
1. Machine learning overview: supervised learning, unsupervised learning, and the workflow
2. Data preprocessing: standardization, normalization, feature selection, dimensionality reduction
3. Supervised learning: classification algorithms, regression algorithms, model evaluation
4. Unsupervised learning: clustering, dimensionality reduction and visualization
5. Model evaluation: cross-validation, learning curves, hyperparameter tuning
scikit-learn is the foundational machine learning library in Python; with these basics you can start building your own models.
Practice Suggestions
1. Practice classification and regression on real-world datasets
2. Try different feature engineering approaches
3. Compare the performance of different algorithms
4. Experiment with hyperparameter tuning