This notebook was created by Jupyter AI with the following prompt:
/generate A demo of how to use the Scikit-learn library
This Jupyter notebook demonstrates how to use the Scikit-learn library for machine learning. It covers loading and preprocessing data, splitting it into training and test sets, choosing an appropriate model, training and evaluating it, tuning its hyperparameters for better performance, and making predictions on new data. The notebook includes code examples for each step, using the Iris dataset and a CSV file as examples.
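The cells below assume a pandas DataFrame named `data` with the features in the leading columns and the target in the last column. A minimal sketch of building one from the Iris dataset mentioned above (a CSV file read with pd.read_csv(...) would work the same way; the 'species' column name is illustrative, not from the original notebook):

import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
data = iris.data.copy()                           # the four feature columns
data['species'] = iris.target_names[iris.target]  # string class labels in the last column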
# Import necessary modules for preprocessing data
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
# Missing values are imputed below rather than dropped up front; calling
# data.dropna(inplace=True) here would leave the SimpleImputer nothing to do
# Separate the features and target variable
X = data.iloc[:,:-1]
y = data.iloc[:, -1]
# Encode non-numeric target variable
le = LabelEncoder()
y = le.fit_transform(y)
# Impute missing values in the feature data, then scale it
scaler = StandardScaler()
imputer = SimpleImputer(strategy='mean')
X = scaler.fit_transform(imputer.fit_transform(X))
# Print first 5 rows of preprocessed data
print(pd.DataFrame(data=X, columns=data.columns[:-1]).head())
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("Training Set Shape:", X_train.shape, y_train.shape)
print("Testing Set Shape:", X_test.shape, y_test.shape)
# Import necessary modules for choosing a model and evaluating it using cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Create a list of models
models = [DecisionTreeClassifier(), KNeighborsClassifier(), LogisticRegression()]
# Evaluate each model using cross-validation and print the mean accuracy and standard deviation
for model in models:
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"Model: {type(model).__name__}, Mean Accuracy: {scores.mean():.3f}, Standard Deviation: {scores.std():.3f}")
# Fit the model with the highest mean accuracy (KNeighborsClassifier in this run) to the training data
best_model = KNeighborsClassifier()
best_model.fit(X_train, y_train)
test_score = best_model.score(X_test, y_test)
print("Testing Accuracy: {:.3f}".format(test_score))
from sklearn.model_selection import GridSearchCV
params = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'p': [1, 2]}
grid_search = GridSearchCV(best_model, params, cv=5)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Mean Accuracy:", grid_search.best_score_)
# Note: grid_search.best_estimator_ is already refit on the full training set;
# a fresh model is built here to show the tuned parameters explicitly
tuned_model = KNeighborsClassifier(**grid_search.best_params_)
tuned_model.fit(X_train, y_train)
test_score_tuned = tuned_model.score(X_test, y_test)
print("Testing Accuracy (Tuned Model): {:.3f}".format(test_score_tuned))
# Use the trained model to make predictions on new data and print the predicted target values
new_data = [[5.1, 3.5, 1.4, 0.2], [6.2, 2.9, 4.3, 1.3], [7.7, 3.0, 6.1, 2.3]]
# Apply the same preprocessing as during training, in the same order: impute first, then scale
predictions = tuned_model.predict(scaler.transform(imputer.transform(new_data)))
print("Predicted Target Values:", predictions)