\documentclass{article}
% Packages
\usepackage{amsmath,amsfonts,amsthm,amssymb,amsopn,bm}
\usepackage[margin=.9in]{geometry}
\usepackage{graphicx}
\usepackage{url}
\usepackage[usenames,dvipsnames]{color}
\usepackage{fancyhdr}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{booktabs}
% For telescoping sum
\usepackage{cancel}
\newcommand\hcancel[2][black]{\setbox0=\hbox{$#2$}%
\rlap{\raisebox{.45\ht0}{\textcolor{#1}{\rule{\wd0}{1pt}}}}#2}
% New colors defined below
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.98,0.98,0.98}
% Code listing style named "mystyle"
\lstdefinestyle{mystyle}{
backgroundcolor=\color{backcolour}, commentstyle=\color{codegreen},
keywordstyle=\color{magenta},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\ttfamily\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=b,
keepspaces=true,
numbersep=5pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2
}
%"mystyle" code listing set
\lstset{style=mystyle}
% For enumerate environment
\usepackage{enumitem}
\renewcommand{\theenumi}{\alph{enumi}}
\renewcommand{\labelenumi}{(\theenumi)}
% Math commands
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\Var}{\mathrm{Var}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\inv}{^{-1}}
% Formatting
\newcommand{\grade}[1]{\small\textcolor{magenta}{[#1 points]} \normalsize}
\date{{}}
% Solutions
\usepackage{ifthen}
\newboolean{showSolutions}
\setboolean{showSolutions}{false} % Change this to toggle solutions
\newcommand{\solution}[1]{\ifthenelse {\boolean{showSolutions}} {{\leavevmode\color{blue}\textbf{Solution:} #1}}{}}
% Comments
\newcommand{\hugh}{\textcolor{blue}}
\newcommand{\ian}{\textcolor{red}}
% No indent
\usepackage[parfill]{parskip}
\begin{document}
\title{Homework \#1}
\author{\normalsize{CSEP 590B: Explainable AI}\\
\normalsize{Prof. Su-In Lee} \\
\normalsize{Due: 4/25/22 11:59 PM}}
\maketitle
\section{Removal-based explanations (15 points)}
\begin{enumerate}
\item \grade{1} In the framework for removal-based explanations, what are the three implementation choices required for each method?
\item \grade{3} What are the specific design choices made by RISE (Petsiuk et al., 2018)?
\item \grade{3} What are the specific design choices made by SAGE (Covert et al., 2020)?
\item \grade{1} Which of the three design choices determines whether an explanation describes \textit{local} or \textit{global} feature importance?
\item \grade{3} Describe two benefits each for using local versus global explanations.
\item \grade{4} Why is SHAP sometimes viewed as a special case of LIME? \textbf{Hint:} focus on the summarization technique, the third implementation choice.
\end{enumerate}
\section{Shapley value properties (20 points)}
For a set of $d$ players represented by $D = \{1, \ldots, d\}$ and a cooperative game $v: 2^d \mapsto \R$, the Shapley value for each player $i \in D$ is defined as:
\begin{align}
\phi_i(v) &= \sum_{S \subseteq D \setminus \{i\}} \frac{|S|!(d - |S| - 1)!}{d!} (v(S\cup \{i\})-v(S)). \label{eq:shapley_semivalue}
\end{align}
\begin{enumerate}
\item \grade{3} Prove the \textit{null player} axiom of the Shapley value, which states that if a player contributes no value in a game $v: 2^d \mapsto \R$, or $v(S \cup \{i\}) - v(S) = 0$ for all $S \subseteq D \setminus \{i\}$, then $\phi_i(v) = 0$.
\item \grade{3} Prove the \textit{linearity} axiom of the Shapley value, which states that given two cooperative games $u: 2^d \mapsto \R$ and $v: 2^d \mapsto \R$ and a third game $w$ defined as $w(S) = u(S) + v(S)$ for all $S \subseteq D$, the following holds for all players $i \in D$:
\begin{equation*}
\phi_i(w) = \phi_i(u) + \phi_i(v).
\end{equation*}
\item \grade{4} Prove the \textit{efficiency} axiom of the Shapley value, which states that the following holds for all games $v: 2^d \mapsto \R$:
\begin{equation*}
\sum_{i = 1}^d \phi_i(v) = v(D) - v(\emptyset). \label{eq:efficiency}
\end{equation*}
\textbf{Hint:} recall the Shapley value's ordering interpretation (see Problem~\ref{sec:shapley_estimation}), and first show that the efficiency property holds when we consider only a single ordering.
% \clearpage
\item \grade{3} Calculate the Shapley value for all players $i \in \{1,2,3\}$ in the following cooperative game $v(S)$:
\begin{table}[!ht]
\centering
\begin{tabular}{|c|cccccccc|}
\hline
$S$ & $\emptyset$ & $\{1\}$ & $\{2\}$ & $\{3\}$ & $\{1, 2\}$ & $\{1, 3\}$ & $\{2, 3\}$ & $\{1, 2, 3\}$ \\ \hline
$v(S)$ & 0 & 1 & 1 & 1 & 2 & 2 & 2 & 3 \\ \hline
\end{tabular}
\end{table}
\item \grade{3} Calculate the Shapley value for all players $i \in \{1,2,3\}$ in the game $v(S)$ given by
\begin{equation*}
v(S)= x_1 + 2x_2 + 3x_3,
\end{equation*}
where $x_i$ are binary variables that are equal to 1 if $i \in S$ and 0 otherwise.
\item \grade{4} Calculate the Shapley values for all players $i \in \{1,2,3,4,5\}$ in the game $v(S)$ given by
\begin{equation*}
v(S)=1x_2 + 1x_3 + 2x_4 + 3x_1x_3 + 5x_2x_5 - 10x_1x_2x_4,
\end{equation*}
where $x_i$ are binary variables that are equal to 1 if $i \in S$ and 0 otherwise. \textbf{Hint:} write a Python function to calculate the Shapley value automatically, using either the formula in Eq.~\ref{eq:shapley_semivalue} or Eq.~\ref{eq:shapley_permutation}.
\end{enumerate}
\section{Shapley value estimation (20 points)} \label{sec:shapley_estimation}
The Shapley values $\phi_1(v), \ldots, \phi_d(v)$ for a game $v: 2^d \mapsto \R$ can also be expressed in terms of player orderings. Let $\pi$ denote an ordering of the indices $\{1, \ldots, d\}$, let $\pi\inv(i)$ represent the position of $i$ in the ordering, and $\Pi$ be the set of all such orderings. For example, with $d = 2$ the only possible orderings are $\pi = [1, 2]$ and $[2, 1]$, so we have $\Pi = \{[1, 2], [2, 1]\}$, and given $\pi = [2, 1]$ we have $\pi\inv(1) = 2$ and $\pi\inv(2) = 1$.
The Shapley value for the $i$th player is then given by:
\begin{equation}
\phi_i(v) = \frac{1}{d!} \sum_{\pi \in \Pi} v\big(\{j : \pi\inv(j) \leq \pi\inv(i)\}\big) - v\big(\{j : \pi\inv(j) < \pi\inv(i)\}\big). \label{eq:shapley_permutation}
\end{equation}
We will first prove that this equation is equivalent to the one in Eq.~\ref{eq:shapley_semivalue}, and we will then consider how to use this formulation to approximate Shapley values when $d$ is large.
\begin{enumerate}
\item \grade{2} Given an ordering $\pi$, what do the sets $\{j : \pi\inv(j) < \pi\inv(i)\}$ and $\{j : \pi\inv(j) \leq \pi\inv(i)\}$ represent?
\item \grade{2} Define $\Delta(v, \pi, i) = v\big(\{j : \pi\inv(j) \leq \pi\inv(i)\}\big) - v\big(\{j : \pi\inv(j) < \pi\inv(i)\}\big)$. What does $\Delta(v, \pi, i)$ represent?
\item \grade{4} Fix a player $i \in D$ and a subset $S \subseteq D \setminus \{i\}$. How many orderings are there in the set $\Pi$, and in how many of these is $i$ preceded by exactly the set of indices in $S$? \textbf{Hint:} the indices in $S$ can appear in any order as long as they all precede $i$.
\item \grade{4} Using the above, prove that the formula in Eq.~\ref{eq:shapley_permutation} is equivalent to Eq.~\ref{eq:shapley_semivalue}. \textbf{Hint:} try grouping the terms in Eq.~\ref{eq:shapley_permutation} by the subset preceding $i$.
\item \grade{4} Shapley values are impractical to calculate for games with many players (large values of $d$), so we often resort to approximations. One simple approach involves sampling $n$ orderings $\pi_1, \ldots, \pi_n \in \Pi$ independently at random, and then calculating estimates for each Shapley value as follows:
\begin{equation*}
\hat \phi_i(v) = \frac{1}{n} \sum_{j = 1}^n \Delta(v, \pi_j, i).
\end{equation*}
Prove that $\hat \phi_i(v)$ is an unbiased estimator of $\phi_i(v)$, or that $\E[\hat \phi_i(v)] = \phi_i(v)$. \textbf{Hint:} the randomness in $\hat \phi_i(v)$ comes from the sampled orderings $\pi_1, \ldots, \pi_n$, and each possible ordering $\pi \in \Pi$ has probability $p(\pi) = \frac{1}{d!}$ because they are chosen at random.
\item \grade{4} For a single ordering $\pi \in \Pi$ sampled at random, denote $V_i = \Var(\Delta(v, \pi, i))$. What is $\Var(\hat \phi_i(v))$ in terms of $V_i$? What does this suggest about our estimator as the number of sampled orderings $n$ becomes large? \textbf{Hint:} see the similar question from HW0.
\end{enumerate}
\section{Permutation tests (20 points)}
\label{sec:perm_test}
A permutation test is a removal-based explanation that measures the impact on the dataset loss from removing individual features. The name comes from the fact that features are removed by permuting (randomly shuffling) columns of the dataset, where each column corresponds to a single feature. Here, we'll implement permutation tests from scratch.
For this problem we'll use the \href{https://archive.ics.uci.edu/ml/datasets/adult}{census income classification} dataset, where the task is to predict if an individual has an income above \$50,000. A nicely formatted version of the dataset can be loaded from the \href{https://github.com/slundberg/shap}{SHAP} package, which you can install into your Python package manager (e.g., Anaconda, virtualenv) as follows:
\begin{lstlisting}[language=bash]
pip install shap
\end{lstlisting}
Once the SHAP package is installed, load the dataset and split it into a train and test set with the following code:
\begin{lstlisting}[language=Python]
import shap
from sklearn.model_selection import train_test_split
X,y = shap.datasets.adult() # Numerical version of data
X_display, y_display = shap.datasets.adult(display=True) # Human-readable data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
\end{lstlisting}
\begin{enumerate}
\item \grade{4} Train a gradient boosting classifier using \texttt{sklearn} as follows (please use this exact code):
\begin{lstlisting}[language=Python]
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, random_state=10)
clf.fit(X_train.values, y_train)
\end{lstlisting}
Report the zero-one classification error (using a classification threshold of 0.5) and log-loss for both the train and test sets.
\item \grade{4} Implement the permutation test approach for the trained model. For each feature, measure the change in the test set zero-one error after permuting the corresponding feature column. Visualize the importance values with a bar plot.
\item \grade{4} Permutation tests are inherently stochastic because they permute features randomly. Run the permutation test from part (b) 10 times and visualize the variance in the importance values using a bar plot with standard deviation error bars.
\item \grade{4} One possible variation of the permutation test is to change the feature removal approach. Implement a new method similar to (b) that removes features by setting them to their mean instead of permuting them. Visualize the importance values using a bar plot.
\item \grade{4} Another possible variation is to change the model behavior we measure. Implement a new method similar to (b) that measures the change in the test set log-loss rather than the zero-one error. Visualize the importance values using a bar plot.
\end{enumerate}
\clearpage
\section{Popular feature importance packages (25 points)}
In this problem, we'll use LIME and SHAP to explain a tree-based model. \href{https://github.com/marcotcr/lime}{LIME} and \href{https://github.com/slundberg/shap}{SHAP} are widely used packages to calculate feature importance for ML models. SHAP implements multiple explanation methods (all variants on Shapley values), and we will use two of them in this problem. You'll need to install both packages as follows, preferably using a recent version of Python 3:
\begin{lstlisting}[language=bash]
pip install shap
pip install lime
\end{lstlisting}
Once again, this problem will use the \href{https://archive.ics.uci.edu/ml/datasets/adult}{census income classification} dataset. Load the data in the same way as in Problem~\ref{sec:perm_test}, using the exact same train/test split. In addition, please use the same model you trained in Problem~\ref{sec:perm_test}(a).
\begin{enumerate}
\item \grade{10} Compute local attributions for the first 200 samples from the test dataset (\texttt{X\_test[:200]}). We will call these samples \textit{explicands} since they are the samples being explained. Use the following methods (and note that KernelSHAP and LIME may take a while to run):
\begin{itemize}
\item \textbf{TreeSHAP}. Example:
\end{itemize}
\begin{lstlisting}[language=Python]
import shap
explainer = shap.TreeExplainer(clf, baselines)
attributions = explainer.shap_values(explicands)
\end{lstlisting}
\begin{itemize}
\item \textbf{KernelSHAP}. Example:
\end{itemize}
\begin{lstlisting}[language=Python]
import shap
explainer = shap.KernelExplainer(clf.predict_proba, baselines)
attributions = explainer.shap_values(explicands)[1]
\end{lstlisting}
\begin{itemize}
\item \textbf{LIME}\footnote{Note that LIME only supports explaining a single explicand at a time, so it must be run separately for each sample.}
% \footnote{You will likely have to reshape the attributions into the appropriate format since LIME will return them as a list of tuples where the first element of each tuple is the feature index and the second element is the attribution.}.
Example:
\end{itemize}
\begin{lstlisting}[language=Python]
import lime.lime_tabular
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values)
attribution = explainer.explain_instance(explicand, clf.predict_proba).local_exp[1]
# Note: you must reshape the attribution and run explain_instance for every explicand
\end{lstlisting}
Note that both TreeSHAP and KernelSHAP require a set of \textit{baselines}, which are used to replace held-out features with samples from their marginal distribution. For this problem, use the last 100 samples from the test set (\texttt{X\_test[-100:]}) as the baselines.
Visualize the results for each method using three separate summary plots (\texttt{shap.summary\_plot}):
\begin{lstlisting}[language=Python]
shap.summary_plot(attributions, explicands)
\end{lstlisting}
Based on these summary plots, what are the most important features according to each method?
\item \grade{5} Visualize the attributions from each method for the features ``Age'' and ``Hours per week'' with a scatter plot, putting the feature value on the x-axis and the feature attribution on the y-axis. Use the \texttt{matplotlib} library to generate the plots. Based on these plots, what observations can you make about how these features relate to the model's income prediction?
\item \grade{5} Visualize the attributions from each method for the sample with index 1 in the test set (\texttt{X\_test.iloc[1]}) using a bar plot, with the feature names on the y-axis and the feature attributions on the x-axis. Based on the TreeSHAP attributions, describe one feature that makes a strong positive contribution to this sample's prediction and one feature that makes a strong negative contribution.
\item \grade{5} An important parameter for KernelSHAP is the number of samples (the \texttt{nsamples} argument in \texttt{KernelExplainer}'s \texttt{shap\_values()} function). Using more samples will improve the quality of the final feature attribution estimates, but it is more costly.\footnote{TreeExplainer does not have this parameter because it computes the marginal Shapley values exactly.} For the previous parts of this problem, we used the default setting of this parameter (see \href{https://github.com/slundberg/shap/blob/master/shap/explainers/_kernel.py#L117}{here}).
To investigate this parameter, run KernelSHAP for the sample with index 1 in the test set (\texttt{X\_test.iloc[1]}) with \texttt{nsamples=10}, \texttt{nsamples=100} and \texttt{nsamples=1000} for 10 runs each. Once again, use a bar plot to depict the mean local feature attributions, but this time include error bars to depict standard deviation across runs of the local feature attributions for each number of samples. Explain the result.
\end{enumerate}
\end{document}