This notebook was created by Jupyter AI with the following prompt:
/generate A demo of how to use the Scikit-learn library
This Jupyter notebook demonstrates how to use the Scikit-learn library for machine learning. It covers loading and preprocessing data, splitting it into training and test sets, choosing an appropriate model, training and evaluating it, tuning its hyperparameters for better performance, and making predictions on new data. The notebook includes code examples for each step, using the Iris dataset and a CSV file as examples.
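The cells below assume a pandas DataFrame named `data` with the features in the leading columns and the target in the last column. A minimal sketch of building one from the Iris dataset mentioned above (a CSV file read with pd.read_csv(...) would work the same way; the 'species' column name is illustrative, not from the original notebook):

import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
data = iris.data.copy()                           # the four feature columns
data['species'] = iris.target_names[iris.target]  # string class labels in the last column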
# Import necessary modules for preprocessing data
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
# Missing values are imputed below rather than dropped up front; calling
# data.dropna(inplace=True) here would leave the SimpleImputer nothing to do
# Separate the features and target variable
X = data.iloc[:,:-1]
y = data.iloc[:, -1]
# Encode non-numeric target variable
le = LabelEncoder()
y = le.fit_transform(y)
# Impute missing values in the feature data, then scale it
scaler = StandardScaler()
imputer = SimpleImputer(strategy='mean')
X = scaler.fit_transform(imputer.fit_transform(X))
# Print first 5 rows of preprocessed data
print(pd.DataFrame(data=X, columns=data.columns[:-1]).head())
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("Training Set Shape:", X_train.shape, y_train.shape)
print("Testing Set Shape:", X_test.shape, y_test.shape)
# Import necessary modules for choosing a model and evaluating it using cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Create a list of models
models = [DecisionTreeClassifier(), KNeighborsClassifier(), LogisticRegression()]
# Evaluate each model using cross-validation and print the mean accuracy and standard deviation
for model in models:
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"Model: {type(model).__name__}, Mean Accuracy: {scores.mean():.3f}, Standard Deviation: {scores.std():.3f}")
# Fit the model with the highest mean accuracy (KNeighborsClassifier in this run) to the training data
best_model = KNeighborsClassifier()
best_model.fit(X_train, y_train)
test_score = best_model.score(X_test, y_test)
print("Testing Accuracy: {:.3f}".format(test_score))
from sklearn.model_selection import GridSearchCV
params = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'p': [1, 2]}
grid_search = GridSearchCV(best_model, params, cv=5)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Mean Accuracy:", grid_search.best_score_)
# Note: grid_search.best_estimator_ is already refit on the full training set;
# a fresh model is built here to show the tuned parameters explicitly
tuned_model = KNeighborsClassifier(**grid_search.best_params_)
tuned_model.fit(X_train, y_train)
test_score_tuned = tuned_model.score(X_test, y_test)
print("Testing Accuracy (Tuned Model): {:.3f}".format(test_score_tuned))
# Use the trained model to make predictions on new data and print the predicted target values
new_data = [[5.1, 3.5, 1.4, 0.2], [6.2, 2.9, 4.3, 1.3], [7.7, 3.0, 6.1, 2.3]]
# Apply the same preprocessing as during training, in the same order: impute first, then scale
predictions = tuned_model.predict(scaler.transform(imputer.transform(new_data)))
print("Predicted Target Values:", predictions)