"""NMF topic-modeling assignment over a tweet dataset.

Loads tweets, builds a TF-IDF document-term matrix, and (across the
numbered questions) factorizes it with NMF to discover topics. Questions
whose answers depend on instructions or printed output not present in
this file are left as explicit TODOs rather than guessed.
"""
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

np.random.seed(416)

# Setup
text = pd.read_csv('tweets-2020-4-30.csv')
text = text.fillna('')  # some rows are nan so replace with empty string

vectorizer = TfidfVectorizer(max_df=0.95)
tf_idf = vectorizer.fit_transform(text['text'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() returns the same vocabulary (as an
# ndarray, converted to a list to match the original type).
feature_names = list(vectorizer.get_feature_names_out())

# Q1
# tf_idf is (documents x vocabulary), so its shape gives both counts.
num_tweets, num_words = tf_idf.shape

# Q2
# TODO create and fit the model and transform our data
# (the required n_components is specified outside this file — confirm
# against the assignment text before filling in)
tweets_projected = None

# Q3
# TODO (answer depends on output produced above, not visible here)
q3 = None

# Q4
small_words = ['dogs', 'cats', 'axolotl']
small_weights = np.array([1, 4, 2])
# Words sorted by weight, largest first: ['cats', 'axolotl', 'dogs'].
# NOTE(review): "as described above" refers to prose outside this file;
# this mirrors the contract of words_from_topic below — confirm.
sorted_small_words = [small_words[i] for i in np.argsort(small_weights)[::-1]]


# Q5
def words_from_topic(topic, feature_names):
    """
    Sorts the words by their weight in the given topic from largest
    to smallest. topic and feature_names should have the same number
    of entries.

    Args:
    - topic (np.array): A numpy array with one entry per word that
      shows the weight in this topic.
    - feature_names (list): A list of words that each entry in topic
      corresponds to

    Returns:
    - A list of words in feature_names sorted by weight in topic from
      largest to smallest.
    """
    # argsort gives ascending order of weights; reverse for descending.
    order = np.argsort(topic)[::-1]
    return [feature_names[i] for i in order]


# Q6
# TODO look at the output above to identify which topic the tweet above
# is most associated to
q6 = None

# Q7
# TODO find index of largest topic (requires tweets_projected from Q2)
largest_topic = None

# Setup - Q8
nmf_small = NMF(n_components=3, init='nndsvd')
tweets_projected_small = nmf_small.fit_transform(tf_idf)

# Q8
# TODO (outlier criterion is described outside this file)
outlier_tweets = None