# Importing necessary libraries and downloading appropriate packages
import numpy as np
import pandas as pd
import nltk
import re
import random
import math
from pandas.tseries import offsets
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
nltk.download('punkt')
nltk.download('stopwords')


# Installing tool to access Genius API, where we will obtain the lyrics
!pip install git+https://github.com/johnwmillr/LyricsGenius.git
from lyricsgenius import Genius


# Accessing Genius API using our token and searching for 6 of Drake's albums
genius = Genius("mVRVsAuDeNkIUzqzTsZ3HcnUowYCvhQCtc1gf2WRM-iO46_7syYrCerNv8fNh6-0")
too_late = genius.search_album("If You're Reading This It's Too Late", "Drake")
take_care = genius.search_album("Take Care", "Drake")
views = genius.search_album("Views", "Drake")
more_life = genius.search_album("More Life", "Drake")
scorpion = genius.search_album("Scorpion", "Drake")
clb = genius.search_album("Certified Lover Boy", "Drake")


# Getting song names from each album
def getSongNames(album):
  song_names = []
  for track in album.tracks:
    song_names.append(track.song.title)
  return song_names

too_late_song_names = getSongNames(too_late)
take_care_song_names = getSongNames(take_care)
views_song_names = getSongNames(views)
more_life_song_names = getSongNames(more_life)
scorpion_song_names = getSongNames(scorpion)
clb_song_names = getSongNames(clb)
print(too_late_song_names)
print(take_care_song_names)
print(views_song_names)
print(more_life_song_names)
print(scorpion_song_names)
print(clb_song_names)

['Legend', 'Energy', '10 Bands', 'Know Yourself', 'No Tellin’', 'Madonna', '6 God', 'Star67', 'Preach', 'Wednesday Night Interlude', 'Used To', '6 Man', 'Now & Forever', 'Company', 'You & The 6', 'Jungle', '6PM in New York', 'How Bout Now', 'My Side']
['Over My Dead Body', 'Shot for Me', 'Headlines', 'Crew Love', 'Take Care', 'Marvin’s Room', 'Buried Alive Interlude', 'Under Ground Kings', 'We’ll Be Fine', 'Make Me Proud', 'Lord Knows', 'Cameras', 'Good Ones Go (Interlude)', 'Doing It Wrong', 'The Real Her', 'Look What You’ve Done', 'HYFR (Hell Ya Fucking Right)', 'Practice', 'The Ride', 'The Motto', 'Hate Sleeping Alone', 'The Motto (Remix)']
['Keep the Family Close', '9', 'U With Me?', 'Feel No Ways', 'Hype', 'Weston Road Flows', 'Redemption', 'With You', 'Faithful', 'Still Here', 'Controlla', 'One Dance', 'Grammys', 'Child’s Play', 'Pop Style', 'Too Good', 'Summers Over Interlude', 'Fire & Desire', 'Views', 'Hotline Bling']
['Free Smoke', 'No Long Talk', 'Passionfruit', 'Jorja Interlude', 'Get It Together', 'Madiba Riddim', 'Blem', '4422', 'Gyalchester', 'Skepta Interlude', 'Portland', 'Sacrifices', 'Nothings Into Somethings', 'Teenage Fever', 'KMT', 'Lose You', 'Can’t Have Everything', 'Glow', 'Since Way Back', 'Fake Love', 'Ice Melts', 'Do Not Disturb', 'More Life [Note]']
['Survival', 'Nonstop', 'Elevate', 'Emotionless', 'God’s Plan', 'I’m Upset', '8 Out of 10', 'Mob Ties', 'Can’t Take a Joke', 'Sandra’s Rose', 'Talk Up', 'Is There More', 'Peak', 'Summer Games', 'Jaded', 'Nice For What', 'Finesse', 'Ratchet Happy Birthday', 'That’s How You Feel', 'Blue Tint', 'In My Feelings', 'Don’t Matter to Me', 'After Dark', 'Final Fantasy', 'March 14']
['Champagne Poetry', 'Papi’s Home', 'Girls Want Girls', 'In The Bible', 'Love All', 'Fair Trade', 'Way 2 Sexy', 'TSU', 'N 2 Deep', 'Pipe Down', 'Yebba’s Heartbreak', 'No Friends In The Industry', 'Knife Talk', '7am on Bridle Path', 'Race My Mind', 'Fountains', 'Get Along Better', 'You Only Live Twice', 'IMY2', 'Fucking Fans', 'The Remorse']


# Getting song objects for each song name
def getSongObjects(song_names):
  song_objects = []
  for song in song_names:
    song_objects.append(genius.search_song(song, "Drake"))
  return song_objects

too_late_songs = getSongObjects(too_late_song_names)
take_care_songs = getSongObjects(take_care_song_names)
views_songs = getSongObjects(views_song_names)
more_life_songs = getSongObjects(more_life_song_names)
scorpion_songs = getSongObjects(scorpion_song_names)
clb_songs = getSongObjects(clb_song_names)


# Getting song lyrics from each song object
def getLyrics(song_objects):
  song_lyrics = []
  for song in song_objects:
    song_lyrics.append(song.lyrics)
  return song_lyrics

too_late_lyrics = getLyrics(too_late_songs)
take_care_lyrics = getLyrics(take_care_songs)
views_lyrics = getLyrics(views_songs)
more_life_lyrics = getLyrics(more_life_songs)
scorpion_lyrics = getLyrics(scorpion_songs)
clb_lyrics = getLyrics(clb_songs)


# Testing code by printing part of a song
print(too_late_lyrics[1][0:200])

Energy Lyrics[Produced by Boi-1da & OB O'Brien]

[Intro: DJ Jah Walton & Drake]
Lickwood means "rewind" and gunshot means "forward"
You requested it, so we rewind
Yeah, way, way, way up
Turn it all up


# Puts all of the lyrics in one data frame
data = pd.DataFrame(too_late_lyrics + scorpion_lyrics + clb_lyrics 
                    + take_care_lyrics + views_lyrics + more_life_lyrics)
data.columns = ["lyrics"]

# Puts the lyrics in dataframes grouped by individual album
too_late_db = pd.DataFrame(too_late_lyrics)
too_late_db.columns = ["lyrics"]
scorpion_db = pd.DataFrame(scorpion_lyrics)
scorpion_db.columns = ["lyrics"]
clb_db = pd.DataFrame(clb_lyrics)
clb_db.columns = ["lyrics"]
take_care_db = pd.DataFrame(take_care_lyrics)
take_care_db.columns = ["lyrics"]
views_db = pd.DataFrame(views_lyrics)
views_db.columns = ["lyrics"]
more_life_db = pd.DataFrame(more_life_lyrics)
more_life_db.columns = ["lyrics"]


data["sentences"] = data["lyrics"].apply(sent_tokenize)
too_late_db["sentences"] = too_late_db["lyrics"].apply(sent_tokenize)
scorpion_db["sentences"] = scorpion_db["lyrics"].apply(sent_tokenize)
clb_db["sentences"] = clb_db["lyrics"].apply(sent_tokenize)
take_care_db["sentences"] = take_care_db["lyrics"].apply(sent_tokenize)
views_db["sentences"] = views_db["lyrics"].apply(sent_tokenize)
more_life_db["sentences"] = more_life_db["lyrics"].apply(sent_tokenize)


# This tokenizes our song lyrics after preprocessing
def generate_tokens(song):
  new_text = re.sub("[^A-Za-z]"," ", song)
  tokens = new_text.lower().split()
  tokens = [el for el in tokens if el not in sw]
  # These words were dominating the topics and do not carry much meaning, so we remove them
  tokens = [el for el in tokens if el not in set(["yeah", "know", "like", "woah", "chorus", "verse", "produced", 
                                                  "lyrics", "get", "got", "say", "drake", "gotta", "tryna", "oh","pre", "intro"])]
  return " ".join(tokens).split()

# This version of generate tokens joins the tokens back together at the end, resulting in one string rather than a list of tokens 
def generate_tokens_2(song):
  new_text = re.sub("[^A-Za-z]"," ", song)
  t = new_text.lower().split()
  t = [el for el in t if el not in sw]
  # These words were dominating the topics and do not carry much meaning, so we remove them
  t = [el for el in t if el not in set(["yeah", "know", "like", "woah", "chorus", "verse", "produced", 
                                        "lyrics", "get", "got", "say", "drake", "gotta", "tryna", "oh", "pre", "intro"])]
  return " ".join(t)

# This version of generate tokens takes in a list of sentences instead
def generate_tokens_3(sentence_list):
    final_tokens = []
    for sentence in sentence_list:
        new_text = re.sub("[^A-Za-z]"," ", sentence)
        tokens = new_text.lower().split()
        tokens = [el for el in tokens if el not in sw]
        if tokens!=[]:
            final_tokens.append(tokens)
    return final_tokens

# We import a list of stopwords and remove the ones we think may actually be relevant
sw = set(stopwords.words("english"))
sw.remove("against")
sw.remove("below")
sw.remove("he")
sw.remove("her")
sw.remove("herself")
sw.remove("himself")
sw.remove("his")
sw.remove("own")
sw.remove("ours")

# We apply our generate_tokens functions to obtain clean tokens for our lyrics in each data frame
data["clean_tokens"] = data["lyrics"].apply(generate_tokens)
data["clean_tokens_2"] = data["sentences"].apply(generate_tokens_3)
too_late_db["clean_tokens"] = too_late_db["lyrics"].apply(generate_tokens)
scorpion_db["clean_tokens"] = scorpion_db["lyrics"].apply(generate_tokens)
clb_db["clean_tokens"] = clb_db["lyrics"].apply(generate_tokens)
take_care_db["clean_tokens"] = take_care_db["lyrics"].apply(generate_tokens)
views_db["clean_tokens"] = views_db["lyrics"].apply(generate_tokens)
more_life_db["clean_tokens"] = more_life_db["lyrics"].apply(generate_tokens)


""" We wanted to analyze the frequencies of the words that appeared in each album.
    We created a total dict and a dict for each of the albums to be able to 
    compare the albums with each other. We just went through all the words in 
    the tokens and counted each unique word
"""
total_freq = {}
take_care_freq = {}
too_late_freq = {}
views_freq ={}
more_life_freq = {}
scorpion_freq ={}
clb_freq = {}

for i, r in data.iterrows():
    for word in r["clean_tokens"]:
        if word in total_freq:
            total_freq[word] += 1
        else:
            total_freq[word] = 1

for i, r in take_care_db.iterrows():
    for word in r["clean_tokens"]:
        if word in take_care_freq:
            take_care_freq[word] += 1
        else:
            take_care_freq[word] = 1

for i, r in too_late_db.iterrows():
    for word in r["clean_tokens"]:
        if word in too_late_freq:
            too_late_freq[word] += 1
        else:
            too_late_freq[word] = 1

for i, r in views_db.iterrows():
    for word in r["clean_tokens"]:
        if word in views_freq:
            views_freq[word] += 1
        else:
            views_freq[word] = 1

for i, r in more_life_db.iterrows():
    for word in r["clean_tokens"]:
        if word in more_life_freq:
            more_life_freq[word] += 1
        else:
            more_life_freq[word] = 1

for i, r in scorpion_db.iterrows():
    for word in r["clean_tokens"]:
        if word in scorpion_freq:
            scorpion_freq[word] += 1
        else:
            scorpion_freq[word] = 1

for i, r in clb_db.iterrows():
    for word in r["clean_tokens"]:
        if word in clb_freq:
            clb_freq[word] += 1
        else:
            clb_freq[word] = 1


""" Since we're focusing on the general frequency, we decided that we only want to 
    pay attention to the most frequent words and see if there is a correlation there
    that would support our hypothesis. We chose 35 words for each album because 
    that was the amount that removed a lot of filler words such as "yuh" or "ayy"
    which don't inheritely have any meaning.
"""
total_freq = {key:val for key, val in total_freq.items() if val >= 100}
take_care_freq = {key:val for key, val in take_care_freq.items() if val >= 35}
too_late_freq = {key:val for key, val in too_late_freq.items() if val >= 35}
views_freq = {key:val for key, val in views_freq.items() if val >= 35}
more_life_freq = {key:val for key, val in more_life_freq.items() if val >= 35}
scorpion_freq = {key:val for key, val in scorpion_freq.items() if val >= 35}
clb_freq = {key:val for key, val in clb_freq.items() if val >= 35}


# This graph shows all the frequencies of the take care album
words = list(take_care_freq.keys())
values = list(take_care_freq.values())
plt.bar(range(len(take_care_freq)), values, tick_label=words)

plt.xticks(rotation = 90)
plt.title("Frequency of Words in Take Care Album")
plt.xlabel("Words")
plt.ylabel("Number of Words")
plt.show()


# This graph shows all the frequencies of the too late album
words = list(too_late_freq.keys())
values = list(too_late_freq.values())
plt.bar(range(len(too_late_freq)), values, tick_label=words)

plt.xticks(rotation = 90)
plt.title("Frequency of Words in If You're Reading This It's Too Late Album")
plt.xlabel("Words")
plt.ylabel("Number of Words")
plt.show()


# This graph shows all the frequencies for the views album
words = list(views_freq.keys())
values = list(views_freq.values())
plt.bar(range(len(views_freq)), values, tick_label=words)

plt.xticks(rotation = 90)
plt.title("Frequency of Words in Views Album")
plt.xlabel("Words")
plt.ylabel("Number of Words")
plt.show()


# This graph shows all the frequencies for the more life album
words = list(more_life_freq.keys())
values = list(more_life_freq.values())
plt.bar(range(len(more_life_freq)), values, tick_label=words)

plt.xticks(rotation = 90)
plt.title("Frequency of Words in More Life Album")
plt.xlabel("Words")
plt.ylabel("Number of Words")
plt.show()


# This graph shows all the frequencies of the scorpion album
words = list(scorpion_freq.keys())
values = list(scorpion_freq.values())
plt.bar(range(len(scorpion_freq)), values, tick_label=words)
plt.title("Frequency of Words in Scorpion Album")
plt.xlabel("Words")
plt.ylabel("Number of Words")
plt.show()


# This graph shows all the frequencies of the clb album
words = list(clb_freq.keys())
values = list(clb_freq.values())
plt.bar(range(len(clb_freq)), values, tick_label=words)

plt.xticks(rotation = 90)
plt.title("Frequency of Words in Certified Lover Boy Album")
plt.xlabel("Words")
plt.ylabel("Number of Words")
plt.show()


# These are the total frequencies of all the albums and they're plotted accordingly
words = list(total_freq.keys())
values = list(total_freq.values())
plt.bar(range(len(total_freq)), values, tick_label=words)

plt.xticks(rotation = 90)
plt.title("Frequency of Words in All Albums")
plt.xlabel("Words")
plt.ylabel("Number of Words")
plt.show()
print(len(total_freq.keys()))
print(total_freq.keys())

38
dict_keys(['nigga', 'tell', 'back', 'girl', 'life', 'way', 'make', 'fuck', 'em', 'niggas', 'right', 'go', 'shit', 'one', 'feel', 'let', 'embed', 'look', 'take', 'real', 'never', 'still', 'man', 'see', 'wanna', 'time', 'even', 'said', 'things', 'love', 'need', 'want', 'her', 'ayy', 'think', 'cause', 'come', 'baby'])


# Generating 300-dimensional word embeddings using a window size of 5 using skip gram method
model = Word2Vec(data["clean_tokens_2"].sum(), size = 300, sg = 1, window = 5, min_count = 1, seed=1111)


# Finding the 20 words most similar to "man" in the corpus 
model.wv.most_similar(positive=["man"], topn=20)

[('coming', 0.9993363618850708),
 ('zone', 0.9993249177932739),
 ('drizzy', 0.9993224143981934),
 ('dangerous', 0.9993219971656799),
 ('loved', 0.9993082880973816),
 ('proud', 0.9993046522140503),
 ('dead', 0.999297559261322),
 ('comin', 0.9992589950561523),
 ('rihanna', 0.9992565512657166),
 ('none', 0.9992398023605347),
 ('race', 0.9992387294769287),
 ('trouble', 0.9992361664772034),
 ('bridge', 0.9992058277130127),
 ('worse', 0.9992009401321411),
 ('wanted', 0.9991815686225891),
 ('matter', 0.9991762638092041),
 ('summer', 0.9991721510887146),
 ('deserve', 0.9991694092750549),
 ('hope', 0.999164879322052),
 ('follow', 0.999147891998291)]


# Projecting 300 dimensions onto 2 dimensional plane using TSNE
vocab = list(model.wv.vocab)
X = model[vocab]
tsne = TSNE(n_components=2,random_state=1234)
X_tsne = tsne.fit_transform(X)
scatter_df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])


# Plotting the 2 dimensional figure
fig = plt.figure(figsize=(14, 12))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(scatter_df['x'], scatter_df['y'])

# Labeling some key points including "women" and "men"
for word, pos in scatter_df.iterrows():
  if word in ["man", "woman", "dedicated", "time", "money", "preach", "want"]:
    ax.annotate(word, pos)
plt.title("2D Projection of 300D Word Embedding")
plt.xlabel("X Coordinate on 2D Plane")
plt.ylabel("Y Coordinate on 2D Plane")
plt.show()


# Getting the dataset

# https://www.harpersbazaar.com/culture/art-books-music/a18832473/best-feminist-women-empowerment-songs/
training_song_names_women = [ ("Run the World (Girls)", "Beyoncé"), 
                              ("You Don’t Own Me", "Lesley Gore"), 
                              ("Bad Reputation", "Joan Jett"), 
                              ("Bad Girls", "M.I.A."), 
                              ("Q.U.E.E.N.", "Janelle Monáe") , 
                              ("Independent Women - Part 1", "Destiny’s Child") , 
                              ("WAP", "Cardi B"), ("***Flawless", "Beyoncé"), 
                              ("Woman", "Kesha"), 
                              ("Doves in the Wind", "SZA"), ("Nasty", "Janet Jackson"), 
                              ("Respect", "Aretha Franklin"),  
                              ("Girl on Fire", "Alicia Keys"), ("Quiet", "MILCK"), 
                              ("Born This Way", "Lady Gaga"), ("Bodak Yellow", "Cardi B"), 
                              ("Just a Girl", "No Doubt"), ("Don’t Touch My Hair", "Solange"), 
                              ("Just Fine", "Mary J. Blige"), ("U.N.I.T.Y.", "Queen Latifah"), 
                              ("No Scrubs (With Rap)", "TLC"), ("PYNK", "Janelle Monáe"), 
                              ("Juice", "Lizzo"), ("​God is a woman", "Ariana Grande"),
                              ("Pussy Is God", "King Princess"), 
                              ("Girls Need Love", "Summer Walker"), 
                              ("Asexual Wellbeing", "Okay Kaya"), ("Poppin", "Rico Nasty"), 
                              ("Man! I Feel like a Woman!", "Shania Twain"), 
                              ("Girl Blunt", "Leikeli47"), ("In the Party", "Flo Milli")
                              ]

training_song_objects_women = []
for song in training_song_names_women:
  training_song_objects_women.append(genius.search_song(song[0], song[1]))

training_song_lyrics_women = getLyrics(training_song_objects_women)

# https://www.flavorwire.com/489278/25-of-musics-most-obnoxiously-misogynist-songs
# https://www.bustle.com/articles/137558-12-songs-that-are-actually-full-of-super-misogynistic-lyrics
training_song_names_men = [ ("It’s So Easy", "Guns N’ Roses"), ("Wrong Way", "Sublime"), 
                            ("Nookie", "Limp Bizkit"), ("A Man Needs a Maid", "Neil Young"),
                            ("Kim", "Eminem"), 
                            ("Tell That Mick He Just Made My List of Things to Do Today", 
                             "Fall Out Boy"), 
                            ("He Hit Me (It Felt Like a Kiss)", "The Crystals"), 
                            ("Triad (Demo) [2021 Remaster]", "David Crosby"),
                            ("Dominos", "The Big Pink"), ("Every Breath You Take", "The Police"), 
                            ("Me So Horny", "2 Live Crew"), ("Beware of Young Girls", "Dory Previn"),
                            ("Cocaine Blues (At Folsom Prison)", "Johnny Cash"), 
                            ("A Bitch Iz a Bitch", "N.W.A"), ("Under My Thumb", "The Rolling Stones"), 
                            ("Wild World", "Cat Stevens"),
                            ("Ain’t No Fun (If the Homies Can’t Have None)", "Snoop Dogg") , 
                            ("Run for Your Life", "The Beatles"), ("Blurred Lines", "Robin Thicke"), 
                            ("Follow You Home", "Nickelback"), 
                            ("I Will Possess Your Heart", "Death Cab for Cutie"), 
                            ("U.O.E.N.O", "Rocko"),
                            ("Outer Space", "Danny Brown"), ("Gold Digger", "Kanye West"), 
                            ("Better Than Revenge", "Taylor Swift"),
                            ("So Much Better", "Eminem"), ("Fine China", "Chris Brown"), 
                            ("Talk Dirty - Tribute to Jason Derulo and 2 Chainz", "Talk Dirty"), 
                            ("Bitches Ain’t Shit", "Dr. Dre"),
                            ("Love Game", "Eminem"), ("One Less Bitch", "N.W.A")
                            ]

training_song_objects_men = []
for song in training_song_names_men:
  training_song_objects_men.append(genius.search_song(song[0], song[1]))

training_song_lyrics_men = getLyrics(training_song_objects_men)

# Creating a list of classified labels for the corresponding songs (1 for women, 0 for men)
training_song_classifier = []
i = 0
while i < len(training_song_lyrics_women):
  training_song_classifier.append(1)
  i += 1
i = 0
while i < len(training_song_lyrics_men):
  training_song_classifier.append(0)
  i += 1

# Making a data frame for the test set
data_train = pd.DataFrame(training_song_lyrics_women + training_song_lyrics_men)
data_train.columns = ["lyrics"]
data_train["classification"] = training_song_classifier


# Cleaning and tokenizing the text
data_train["clean_tokens"] = data_train["lyrics"].apply(generate_tokens_2)
data_train.head()


# Getting the TF-IDF matrix
vect = TfidfVectorizer()
vect.fit(data_train["clean_tokens"])
songs_dtm = vect.transform(data_train["clean_tokens"])
songs_dtm_df = pd.DataFrame(songs_dtm.toarray(),
columns = vect.get_feature_names())


# Creating the training and testing sets
X_train = songs_dtm_df.values
y_train = data_train["classification"]
# Verifying that our training and testing sets are the proper size
print(X_train.shape)
print(y_train.shape)
# This makes sense because we have 62 test songs

(62, 2848)
(62,)


# Running the Linear Support Vector Machine
SVM = svm.SVC(kernel = "linear") 
SVM.fit(X_train, y_train)

SVC(kernel='linear')


# Testing the model (in our case we are interested in the # of Drake's songs classified as for men and # for women)
def predict_new_song(lyrics, model):
    cleaned_tokens = generate_tokens_2(lyrics)
    vect_txt = vect.transform([cleaned_tokens]).toarray()
    if model.predict(vect_txt)[0] == 0:
      return("men")
    elif model.predict(vect_txt)[0] == 1:
      return("women")
    else:
      return("ERROR")
  
drake_songs_for_men = 0
drake_songs_for_women = 0

for drake_song in data["lyrics"]:
  if predict_new_song(drake_song, SVM) == "women":
    drake_songs_for_women += 1
  elif predict_new_song(drake_song, SVM) == "men":
    drake_songs_for_men += 1

print("Number of Drake Songs for Women: " + str(drake_songs_for_women))
print("Number of Drake Songs for Men: " + str(drake_songs_for_men))
print("Proportion of Drake Songs for Women: " + str(drake_songs_for_women/len(data["lyrics"])))
print("Proportion of Drake Songs for Men: " + str(drake_songs_for_men/len(data["lyrics"])))

Number of Drake Songs for Women: 55
Number of Drake Songs for Men: 75
Proportion of Drake Songs for Women: 0.4230769230769231
Proportion of Drake Songs for Men: 0.5769230769230769


# Calculating the mean (n * p) and the standard deviation (sqrt(n * p * (1-p)))
mean = 130 * 0.75
sd = math.sqrt(130 * 0.75 * (1-0.25))
print("Mean: " + str(mean))
print("Standard Deviation: " + str(sd))

Mean: 97.5
Standard Deviation: 8.551315688243536


# These functions were obtained from the lecture code

# We use this function to create our cumulative distribution function given the mean and sd
def normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    return (1 + math.erf((x - mu) / np.sqrt(2) / sigma)) / 2

# We use this function to find the left tail probability that we observe a value at or below the lo value
def norm_below(lo: float, mu: float = 0, sigma: float = 1) -> float:
    return normal_cdf(lo, mu, sigma)


# The probability that we got the results we did given that the null hypothesis is true
print(norm_below(55, mean, sd))

3.3477237665335835e-07

	lyrics	classification	clean_tokens
0	Run the World (Girls) Lyrics[Intro]\nGirls, we...	1	run world girls girls run mutha girls run muth...
1	You Don’t Own Me Lyrics[Verse 1]\nYou don't ow...	1	own own one many toys own go boys tell tell pl...
2	Bad Reputation Lyrics[Verse 1]\nI don't give a...	1	bad reputation give damn bout reputation livin...
3	Bad Girls Lyrics[Chorus]\nLive fast, die young...	1	bad girls live fast die young bad girls well l...
4	Q.U.E.E.N. Lyrics[Intro: Janelle Monáe]\nGirl,...	1	q u e e n janelle mon e girl crazy let tell ja...

Data Science Tutorial¶

Sahil Goel, Salma Khairat, and Saima Ahmed¶

Preliminary Step¶

Obtaining Drake's Song Lyrics¶

Cleaning and Organizing the Data¶

Tokenizing the Song Lyrics¶

Examining Word Frequencies By Album (In Chronological Order)¶

Word Embeddings on All 6 Albums (using Word2Vec)¶

Linear Support Vector Machine on All 6 Albums¶

Hypothesis Testing¶

Conclusion¶