"""NMF topic-modeling assignment over a tweet dataset.

Loads tweets, builds a TF-IDF document-term matrix, and (across the
numbered questions) factorizes it with NMF to discover topics. Questions
whose answers depend on instructions or printed output not present in
this file are left as explicit TODOs rather than guessed.
"""
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

np.random.seed(416)

# Setup
text = pd.read_csv('tweets-2020-4-30.csv')
text = text.fillna('')  # some rows are nan so replace with empty string

vectorizer = TfidfVectorizer(max_df=0.95)
tf_idf = vectorizer.fit_transform(text['text'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() returns the same vocabulary (as an
# ndarray, converted to a list to match the original type).
feature_names = list(vectorizer.get_feature_names_out())

# Q1
# tf_idf is (documents x vocabulary), so its shape gives both counts.
num_tweets, num_words = tf_idf.shape

# Q2
# TODO create and fit the model and transform our data
# (the required n_components is specified outside this file — confirm
# against the assignment text before filling in)
tweets_projected = None

# Q3
# TODO (answer depends on output produced above, not visible here)
q3 = None

# Q4
small_words = ['dogs', 'cats', 'axolotl']
small_weights = np.array([1, 4, 2])
# Words sorted by weight, largest first: ['cats', 'axolotl', 'dogs'].
# NOTE(review): "as described above" refers to prose outside this file;
# this mirrors the contract of words_from_topic below — confirm.
sorted_small_words = [small_words[i] for i in np.argsort(small_weights)[::-1]]


# Q5
def words_from_topic(topic, feature_names):
    """
    Sorts the words by their weight in the given topic from largest
    to smallest. topic and feature_names should have the same number
    of entries.

    Args:
    - topic (np.array): A numpy array with one entry per word that
      shows the weight in this topic.
    - feature_names (list): A list of words that each entry in topic
      corresponds to

    Returns:
    - A list of words in feature_names sorted by weight in topic from
      largest to smallest.
    """
    # argsort gives ascending order of weights; reverse for descending.
    order = np.argsort(topic)[::-1]
    return [feature_names[i] for i in order]


# Q6
# TODO look at the output above to identify which topic the tweet above
# is most associated to
q6 = None

# Q7
# TODO find index of largest topic (requires tweets_projected from Q2)
largest_topic = None

# Setup - Q8
nmf_small = NMF(n_components=3, init='nndsvd')
tweets_projected_small = nmf_small.fit_transform(tf_idf)

# Q8
# TODO (outlier criterion is described outside this file)
outlier_tweets = None