import numpy as np
import pandas as pd
import scipy.stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Add any additional imports here
# TODO
np.random.seed(416)
loans = pd.read_csv('lending-club-data.csv')
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x == 0 else -1)
loans = loans.drop(columns='bad_loans')
# Q1: Write code to find most frequent grade
# TODO
mode_grade = None
# Q2: Write code to find percent of loans for rent
# TODO
percent_rent = None
# Preprocess data
features = [
'grade', # grade of the loan (e.g. A or B)
'sub_grade', # sub-grade of the loan (e.g. A1, A2, B1)
'short_emp', # one year or less of employment (0 or 1)
'emp_length_num', # number of years of employment (a number)
'home_ownership', # home_ownership status (one of own, mortgage, rent or other)
'dti', # debt to income ratio (a number)
'purpose', # the purpose of the loan (one of many values)
'term', # the term of the loan (36 months or 60 months)
'last_delinq_none', # has borrower had a delinquincy (0 or 1)
'last_major_derog_none', # has borrower had 90 day or worse rating (0 or 1)
'revol_util', # percent of available credit being used (number between 0 and 100)
'total_rec_late_fee', # total late fees received to day (a number)
]
target = 'safe_loans' # prediction target (y) (+1 means safe, -1 is risky)
# Extract the feature columns and target column
loans = loans[features + [target]]
loans = pd.get_dummies(loans)
features = list(loans.columns)
features.remove('safe_loans')
train_data, validation_data = train_test_split(loans, test_size=0.2)
# Q3: Train a model with max_depth=6
# TODO
decision_tree_model = None
# Q4: Find train and validation accuracy
# TODO
decision_train_accuracy = None
decision_validation_accuracy = None
# Q5: Train a decision tree model with max_depth=10
# TODO
big_tree_model = None
big_train_accuracy = None
big_validation_accuracy = None
# Q6: Use GridSearchCV to find best settings of hyperparameters
# TODO
search = None
# Q7
class RandomForest416:
"""
This class implements the common sklearn model interface (has a fit and predict function).
A random forest is a collection of decision trees that are trained on random subsets of the
dataset. When predicting the value for an example, takes a majority vote from the trees.
"""
def __init__(self, num_trees, max_depth=1):
"""
Constructs a RandomForest416 that uses the given numbner of trees, each with a
max depth of max_depth.
"""
self._trees = [
DecisionTreeClassifier(max_depth=max_depth)
for i in range(num_trees)
]
def fit(self, X, y):
"""
Takes an input dataset X and a series of targets y and trains the RandomForest416.
Each tree will be trained on a random sample of the data that samples the examples
uniformly at random (with replacement). Each random dataset will have the same number
of examples as the original dataset, but some examples may be missing or appear more
than once due to the random sampling with replacement.
"""
# Q7
# TODO
def predict(self, X):
"""
Takes an input dataset X and returns the predictions for each example in X.
"""
# Builds up a 2d array with n rows and T columns
# where n is the number of points to classify and T is the number of trees
predictions = np.zeros((len(X), len(self._trees)))
for i, tree in enumerate(self._trees):
# Make predictions using the current tree
preds = tree.predict(X)
# Store those predictions in ith column of the 2d array
predictions[:, i] = preds
# For each row of predictions, find the most frequent label (axis=1 means across columns)
return scipy.stats.mode(predictions, axis=1)[0]
# Q7
# TODO
rf_train_accuracy = None
rf_validation_accuracy = None