import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from scipy import stats
from wordcloud import WordCloud

 imdb_metadata = pd.read_csv('datasets/IMDB metadata.csv', index_col=0,na_values="\\N")
print(imdb_metadata.tconst.value_counts().count())
imdb_metadata.head()

C:\Users\kaiwen\AppData\Local\Temp\ipykernel_23184\2907941367.py:1: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.
  imdb_metadata = pd.read_csv('datasets/IMDB metadata.csv', index_col=0,na_values="\\N")

9126554

 imdb_directors_writers = pd.read_csv('datasets/IMDB director writers.csv', index_col=0,na_values="\\N")
imdb_directors_writers.head()

 imdb_ratings = pd.read_csv('datasets/IMDB ratings.csv', index_col=0,na_values="\\N")
print(imdb_ratings.tconst.value_counts().count())
imdb_ratings.head()

1258745

 imdb_names = pd.read_csv("datasets/IMDB names.csv", na_values="\\N", index_col=0)
imdb_names.head()

 imdb_cast_crew = pd.read_csv("datasets/IMDB principal cast crew.csv", na_values="\\N", index_col=0)
imdb_cast_crew.head()

 tmdb = pd.read_excel("datasets/tmdb.xlsx", index_col=0, header=0, na_values="\\N")
tmdb

 language_codes = pd.DataFrame([pd.read_json("datasets/language-codes.json",  typ='series')]).T
language_codes

 language_codes.insert(0,"code", language_codes.index)
language_codes.columns = ["code", "country"]
languages_codes = language_codes.reset_index(drop=True)
languages_codes

 # IMDB contains 9 million entries, however, not all of them are movies. IMDB tells us the type of entry under "titleType".
# As we can see, most of the entries are TV episodes and only 617619 are movies.
# We will drop all the entries that are not movies. 
# We will also delete the titleType and endYear columns. Movies do not have an endYear, only tv episodes or other such entries have endYears.

print(imdb_metadata.titleType.value_counts())
imdb_movie_metadata = (imdb_metadata.loc[imdb_metadata.titleType == "movie"]).reset_index(drop=True)
del imdb_movie_metadata["titleType"]
del imdb_movie_metadata["endYear"]
imdb_movie_metadata

tvEpisode       6871485
short            882541
movie            617619
video            263832
tvSeries         229177
tvMovie          137074
tvMiniSeries      44596
tvSpecial         37840
videoGame         31760
tvShort           10628
tvPilot               2
Name: titleType, dtype: int64

 # here, we merge the IMDB datasets together based on tconst (IMDB id)

imdb = pd.merge(imdb_movie_metadata, imdb_ratings, how="left", on="tconst")
imdb = pd.merge(imdb, imdb_directors_writers, how="left", on="tconst")
imdb.info()
imdb.runtimeMinutes = pd.to_numeric(imdb.runtimeMinutes) #change type of runtimeMinutes to float
imdb

<class 'pandas.core.frame.DataFrame'>
Int64Index: 617619 entries, 0 to 617618
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          617619 non-null  object 
 1   primaryTitle    617619 non-null  object 
 2   originalTitle   617619 non-null  object 
 3   isAdult         617619 non-null  float64
 4   startYear       536060 non-null  float64
 5   runtimeMinutes  389711 non-null  object 
 6   genres          545941 non-null  object 
 7   averageRating   280695 non-null  float64
 8   numVotes        280695 non-null  float64
 9   directors       552359 non-null  object 
 10  writers         467056 non-null  object 
dtypes: float64(4), object(7)
memory usage: 56.5+ MB

 # Dealing with missing values in IMDB datasets
# I drop rows with null entries in startYear, runtimeMinutes, genres, avgrating and numvotes
# This is movies have missing values in these columns typically because they are not so well known, or were released from a very long time ago, and we are not interested in analysing these movies.
# Also, I have a lot of data already and I do not need so much 

imdb = imdb.dropna(subset=["genres", "startYear", "runtimeMinutes", "averageRating", "numVotes"]).reset_index(drop=True)

# Use info and describe to check our dataset
imdb.info()
imdb.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246764 entries, 0 to 246763
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          246764 non-null  object 
 1   primaryTitle    246764 non-null  object 
 2   originalTitle   246764 non-null  object 
 3   isAdult         246764 non-null  float64
 4   startYear       246764 non-null  float64
 5   runtimeMinutes  246764 non-null  float64
 6   genres          246764 non-null  object 
 7   averageRating   246764 non-null  float64
 8   numVotes        246764 non-null  float64
 9   directors       244834 non-null  object 
 10  writers         221119 non-null  object 
dtypes: float64(5), object(6)
memory usage: 20.7+ MB

 # Make sure that we do not include documentaries in our dataset, as they are not movies
imdb = imdb[imdb['genres'].str.contains('Documentary') == False]

# As seen from .describe(), there is something wrong with runtime minutes as the minimum time is 1min, which is clearly not a legitimate movie
# According to the Academy of Motion Picture Arts and Sciences, the American Film Institute and the British Film Institute, 
# a feature film is one that runs for more than 40 minutes
# Hence, we will drop all movies which are <= 40min
imdb = imdb[imdb['runtimeMinutes'] > 40]

# We are only interested in movies which are not obscure.
# Some movies in IMDB dataset only have a few votes, which can skew our results.
# Hence, we will drop all movies which have less than 1000 votes.

imdb = imdb[imdb['numVotes'] >= 1000]

imdb.info()
imdb = imdb.sort_values(by = "startYear").reset_index(drop=True)
imdb

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35935 entries, 16 to 246762
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          35935 non-null  object 
 1   primaryTitle    35935 non-null  object 
 2   originalTitle   35935 non-null  object 
 3   isAdult         35935 non-null  float64
 4   startYear       35935 non-null  float64
 5   runtimeMinutes  35935 non-null  float64
 6   genres          35935 non-null  object 
 7   averageRating   35935 non-null  float64
 8   numVotes        35935 non-null  float64
 9   directors       35842 non-null  object 
 10  writers         35658 non-null  object 
dtypes: float64(5), object(6)
memory usage: 3.3+ MB

 # imdb.to_excel("imdb_cleaned.xlsx")

 # merge tmdb and imdb datasets
imdb.columns = ["imdb_id"] + list(imdb.columns[1:])
all_mdb = imdb.merge(tmdb, on = "imdb_id", how = "left")
all_mdb

 # Dealing with missing values in TMDB dataset
all_mdb.info()

# NaN in "Status" column means that there was no result for that entry when we webscraped TMDB
# As there are only ~200 NaNs in "Status", we can drop these entries

all_mdb = all_mdb.dropna(subset = ["status"]).reset_index(drop=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35935 entries, 0 to 35934
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               35935 non-null  object 
 1   primaryTitle          35935 non-null  object 
 2   originalTitle         35935 non-null  object 
 3   isAdult               35935 non-null  float64
 4   startYear             35935 non-null  float64
 5   runtimeMinutes        35935 non-null  float64
 6   genres                35935 non-null  object 
 7   averageRating         35935 non-null  float64
 8   numVotes              35935 non-null  float64
 9   directors             35842 non-null  object 
 10  writers               35658 non-null  object 
 11  status                35776 non-null  object 
 12  budget                35776 non-null  float64
 13  revenue               35776 non-null  float64
 14  release_date          35762 non-null  object 
 15  original_language     35776 non-null  object 
 16  popularity            35776 non-null  float64
 17  production_companies  35776 non-null  object 
 18  production_countries  35776 non-null  object 
 19  spoken_languages      35776 non-null  object 
 20  keywords              35776 non-null  object 
dtypes: float64(8), object(13)
memory usage: 6.0+ MB

 # Data cleaning the data webscraped from TMDB, turning strings into lists

# Turn genres, directors, writers into lists.

all_mdb.loc[:,"genres"] = all_mdb.loc[:,"genres"].str.split(",")
all_mdb.loc[:,"directors"] = all_mdb.loc[:,"directors"].str.split(",")
all_mdb.loc[:,"writers"] = all_mdb.loc[:,"writers"].str.split(",")

# I webscraped the following columns as lists, but
# Pandas stores lists as strings in a csv, which is what we read from, so we must convert it back to a list using literal_eval, which is safer than eval
# We do this for production companies, production countries, spoken languages, and keywords.

# As literal_eval can't process NaN values, we convert it to an empty list. After the eval, we convert the empty lists back to NaN.

all_mdb.loc[:, "production_companies"] = all_mdb.loc[:, "production_companies"].replace(np.nan, "[]").apply(lambda x: literal_eval(str(x)))
all_mdb.loc[:, "production_companies"].loc[all_mdb.loc[:, "production_companies"].apply(lambda x: x == [])] = np.nan

all_mdb.loc[:, "production_countries"] = all_mdb.loc[:, "production_countries"].replace(np.nan, "[]").apply(lambda x: literal_eval(str(x)))
all_mdb.loc[:, "production_countries"].loc[all_mdb.loc[:, "production_countries"].apply(lambda x: x == [])] = np.nan

all_mdb.loc[:, "spoken_languages"] = all_mdb.loc[:, "spoken_languages"].replace(np.nan, "[]").apply(lambda x: literal_eval(str(x)))
all_mdb.loc[:, "spoken_languages"].loc[all_mdb.loc[:, "spoken_languages"].apply(lambda x: x == [])] = np.nan

all_mdb.loc[:, "keywords"] = all_mdb.loc[:, "keywords"].replace(np.nan, "[]").apply(lambda x: literal_eval(str(x)))
all_mdb.loc[:, "keywords"].loc[all_mdb.loc[:, "keywords"].apply(lambda x: x == [])] = np.nan

C:\Users\kaiwen\AppData\Local\Temp\ipykernel_23184\270989945.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_mdb.loc[:, "production_companies"].loc[all_mdb.loc[:, "production_companies"].apply(lambda x: x == [])] = np.nan
C:\Users\kaiwen\AppData\Local\Temp\ipykernel_23184\270989945.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_mdb.loc[:, "production_countries"].loc[all_mdb.loc[:, "production_countries"].apply(lambda x: x == [])] = np.nan
C:\Users\kaiwen\AppData\Local\Temp\ipykernel_23184\270989945.py:22: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_mdb.loc[:, "spoken_languages"].loc[all_mdb.loc[:, "spoken_languages"].apply(lambda x: x == [])] = np.nan
C:\Users\kaiwen\AppData\Local\Temp\ipykernel_23184\270989945.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_mdb.loc[:, "keywords"].loc[all_mdb.loc[:, "keywords"].apply(lambda x: x == [])] = np.nan

 all_mdb.info()
all_mdb.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35776 entries, 0 to 35775
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               35776 non-null  object 
 1   primaryTitle          35776 non-null  object 
 2   originalTitle         35776 non-null  object 
 3   isAdult               35776 non-null  float64
 4   startYear             35776 non-null  float64
 5   runtimeMinutes        35776 non-null  float64
 6   genres                35776 non-null  object 
 7   averageRating         35776 non-null  float64
 8   numVotes              35776 non-null  float64
 9   directors             35683 non-null  object 
 10  writers               35525 non-null  object 
 11  status                35776 non-null  object 
 12  budget                35776 non-null  float64
 13  revenue               35776 non-null  float64
 14  release_date          35762 non-null  object 
 15  original_language     35776 non-null  object 
 16  popularity            35776 non-null  float64
 17  production_companies  33062 non-null  object 
 18  production_countries  34858 non-null  object 
 19  spoken_languages      35247 non-null  object 
 20  keywords              27977 non-null  object 
dtypes: float64(8), object(13)
memory usage: 5.7+ MB

 successful_movies = all_mdb.loc[(all_mdb.budget > 0) & (all_mdb.revenue > 0) & (all_mdb.revenue / all_mdb.budget >= 2.5)].reset_index(drop=True)
successful_movies.head()

 successful_movies.describe()

 successful_movies = successful_movies.loc[successful_movies.revenue >= 1000000].reset_index(drop=True)
successful_movies = successful_movies.loc[successful_movies.budget >= 500000].reset_index(drop=True)
successful_movies.describe()

 successful_movies

 # make two dataframe containing revenue and genre of movie, one with all movies and one with successful movies.
genres = pd.DataFrame([all_mdb.revenue, all_mdb.genres]).T.dropna()
genres = genres.loc[genres.revenue > 0].reset_index(drop=True)
genres = genres.explode('genres').reset_index(drop=True)

successful_movies_genres = pd.DataFrame([successful_movies.revenue, successful_movies.genres]).T.explode('genres').reset_index(drop=True)

genres.head()

 successful_movies_genres.head()

 successful_genres_percentage = (successful_movies_genres.genres.value_counts() / genres.genres.value_counts() * 100).round(decimals=2).to_frame().sort_values(by="genres", ascending=False)
successful_genres_percentage = successful_genres_percentage.replace(np.nan, 0) 
#As NaN means there is division by zero error, also there are no Adult movies in successful_movies

genres_percentage = (genres.genres.value_counts() / genres.genres.count() * 100).round(decimals = 2)
# percentage of genres in all movies

fig = plt.figure(figsize=(15,12))
ax0 = fig.add_subplot(1, 2, 1)

ax0 = sns.barplot(y=genres_percentage.index.values ,x=genres_percentage.values, orient='h')
for i in ax0.containers:
    ax0.bar_label(i,)
ax0.set_title("Percentage of frequency of genres in all movies")
ax0.set_xlabel("Percentage of frequency (%)")
ax0.set_ylabel("Genres")

ax1 = fig.add_subplot(1, 2, 2) 
ax1 = sns.barplot(y=successful_genres_percentage.index.values ,x=successful_genres_percentage.genres, orient='h')
for i in ax1.containers:
    ax1.bar_label(i,)
ax1.set_title("Percentage of successful movies for each genre")
ax1.set_xlabel("Percentage of successful movies (%)")
ax1.set_ylabel("Genres")
plt.show()

 # Mean revenue by genre
genre_mean = genres.groupby("genres").revenue.mean().sort_values(ascending=False)
genre_mean

genres
Adventure    1.598268e+08
Animation    1.460725e+08
Sci-Fi       1.331758e+08
Action       1.084569e+08
Fantasy      1.066539e+08
Family       8.922234e+07
Adult        7.166667e+07
Comedy       5.674864e+07
Thriller     5.457347e+07
Mystery      4.538046e+07
Crime        4.150252e+07
Biography    4.020940e+07
Music        3.992448e+07
History      3.864902e+07
Musical      3.694949e+07
Drama        3.673604e+07
Romance      3.665500e+07
Sport        3.609680e+07
Horror       3.484590e+07
Western      3.050453e+07
War          2.848186e+07
Film-Noir    3.780342e+06
Name: revenue, dtype: float64

 plt.figure(figsize=(20,12)) 
sns.pointplot( y=np.log10(genre_mean.values.astype('float')), x=genre_mean.index.values)
plt.title("Pointplot of mean revenue by genre")
plt.show()

#Sort boxplot by descending median
median_order = genres.groupby(by=["genres"])["revenue"].median().sort_values(ascending=False).index
plt.figure(figsize=(20,12)) 
sns.boxplot( y=np.log10(genres["revenue"].astype('float')), x=genres["genres"], order = median_order)
plt.title("Boxplot of revenue by genre")
plt.show()

 factor_revenue = all_mdb   # drop all with 0 revenue, as these may skew our results
factor_revenue.revenue = factor_revenue.revenue.replace(0,np.nan)
factor_revenue = factor_revenue.dropna(subset=["revenue"])

 def reg_residual_subplot(x_name, y_name, x_df,y_df, x_lim, y_lim):
    # transparent = True or False
    fig = plt.figure(figsize=(20,4))
    ax0 = fig.add_subplot(1, 2, 1)
    
    if (len(x_lim) == 0):
        x_lim = (None, None)
    if (len(y_lim) == 0):
        y_lim = (None, None)

        
    sns.regplot(x=x_df.astype(float), y=y_df.astype(float), scatter_kws={'alpha':0.3}, line_kws={'color': 'red'}, ax=ax0)
    ax0.set_xlim(x_lim)
    ax0.set_ylim(y_lim)
    ax0.set_xlabel(x_name)
    ax0.set_ylabel(y_name)
    ax0.set_title("Linear Regression Plot of " + x_name + " against " + y_name)
    
    ax1 = fig.add_subplot(1, 2, 2)
    
    sns.residplot(x=x_df.astype(float), y=y_df.astype(float), scatter_kws={'alpha':0.3}, ax=ax1)
    ax1.set_xlim(x_lim)
    ax1.set_ylim(y_lim)
    ax1.set_xlabel(x_name)
    ax1.set_ylabel(y_name)
    ax1.set_title("Residual Plot of " + x_name + " against " + y_name)
    
    coef = pd.DataFrame(x_df.astype(float))
    coef.columns = ["x"]
    coef["y"] = y_df.astype(float)
    coef = coef.dropna()
    
    pearson_coef, p_value = stats.pearsonr(coef["x"], coef["y"])
    print("Pearson's Coefficient is " + str(pearson_coef) + " P-value is " + str(p_value))
    
    plt.show()

 reg_residual_subplot("Budget", "Revenue", all_mdb.budget, all_mdb.revenue, (), ())

Pearson's Coefficient is 0.7467009218280032 P-value is 0.0

 reg_residual_subplot("Budget", "Revenue", factor_revenue.budget, factor_revenue.revenue, (), ())

Pearson's Coefficient is 0.7467009218280032 P-value is 0.0

 reg_residual_subplot("Budget", "Revenue", successful_movies.budget, successful_movies.revenue, (), ())

Pearson's Coefficient is 0.8524386941593549 P-value is 0.0

 reg_residual_subplot("Runtime", "Revenue", all_mdb.runtimeMinutes, all_mdb.revenue, (0,300), (0,1.0e9))

Pearson's Coefficient is 0.17891001036238946 P-value is 1.292560240450447e-79

 reg_residual_subplot("Runtime", "Revenue", factor_revenue.runtimeMinutes, factor_revenue.revenue, (0,300), (0,1.0e9))

Pearson's Coefficient is 0.17891001036238946 P-value is 1.292560240450447e-79

 reg_residual_subplot("Runtime", "Revenue", successful_movies.runtimeMinutes, successful_movies.revenue, (0,300),(0,1.5e9))

Pearson's Coefficient is 0.20170933194289647 P-value is 1.9362003135175291e-28

 def plot_correlation_map(df):
    corr = df.corr()
    _ , ax = plt.subplots( figsize =(12,10))
    cmap = sns.diverging_palette(240, 10, as_cmap = True )
    _ = sns.heatmap(corr,cmap = cmap,square=True, cbar_kws={ 'shrink' : .9 }, ax=ax, annot = True, annot_kws = { 'fontsize' : 12 })

 plot_correlation_map(all_mdb[['budget','revenue','runtimeMinutes','numVotes']])

 def barchart_genre_subplot(title,x_name,y_name, df, factor,all_movies): 
    # Factor: "Original Language"
    # all_movies: True, is using all movies dataset, False, is using successful movies dataset
    
    genres = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror',
       'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Sport', 'Thriller', 'War', 'Western', 'Reality-TV', 'Adult']
    
    languages = ["English","Hindi","French","Japanese","Spanish","Russian","German","Italian","Danish","Korean", "Chinese"]
    colours = ["tab:blue","tab:orange","tab:green","tab:red","tab:purple","tab:brown","tab:pink","tab:gray","tab:olive","tab:cyan","black"]

    colours = pd.DataFrame(list(zip(languages, colours)), columns =["country","colours"])
     
    nrow = 4
    ncol = 6

    fig = plt.figure(figsize=(20,10))
    gs = fig.add_gridspec(nrow, ncol, hspace=0.2, wspace=0)
    axes = gs.subplots(sharex=True, sharey=True)
    
    count = 0

    for r in range(nrow):
        for c in range(ncol):
            
            if (factor == "Original Language"):
                a = df.loc[(genres[count])].dropna()
                if (title == "Percentage frequency of original language by genre in successful movies"):
                    a = a.loc[a.frequency > 5]
                elif (all_movies == False):
                    a = a.loc[a.frequency > 20]
                else:
                    a = a.head(3)
                if a.frequency.sum() == 0:
                    axes[r,c].set_title(genres[count])
                    axes[r,c].set_ylabel("") 
                    axes[r,c].set_xlabel("")            
                    count+=1
                    continue
                bar_colour = pd.merge(a, colours, on="country", how="left")
                a.T.plot(kind='bar',ax = axes[r,c],color = bar_colour.colours.tolist())

            axes[r,c].set_title(genres[count])
            axes[r,c].set_ylabel("") 
            axes[r,c].set_xlabel("")
            axes[r,c].get_legend().remove()
            
            for bars in axes[r,c].containers:
                axes[r,c].bar_label(bars)
            
            count+=1
            if (count == len(df.index.get_level_values(0).unique())):
                break
                
    for ax in axes.flat:
        ax.label_outer()

    lines = []
    labels = []
  
    for ax in fig.axes:
        Line, Label = ax.get_legend_handles_labels()
        lines.extend(Line)
        labels.extend(Label)
  
    labels_df = pd.DataFrame(list(zip(lines,labels)), columns =["lines","labels"])
    labels_df = labels_df.drop_duplicates(subset = "labels") 
    fig.legend(labels_df.lines, labels_df.labels, loc='upper right')
    
    fig.supxlabel(x_name)
    fig.supylabel(y_name)
    plt.suptitle(title)
        
    plt.show()

 all_original_lang = pd.DataFrame([all_mdb.original_language, all_mdb.genres]).T.dropna().explode("genres").reset_index(drop=True)
all_original_lang.columns = ["code", "genres"]
all_original_lang = all_original_lang.merge(language_codes, how="left", on="code")
all_original_lang = all_original_lang.groupby("genres").country.value_counts().to_frame()
all_original_lang.columns = ["frequency"]

percent_all_original_lang = (all_original_lang['frequency'] / all_original_lang.groupby('genres')['frequency'].transform('sum') * 100).round(decimals=2).to_frame()

successful_original_lang = pd.DataFrame([successful_movies.original_language, successful_movies.genres]).T.dropna().explode("genres").reset_index(drop=True)
successful_original_lang.columns = ["code", "genres"]
successful_original_lang = successful_original_lang.merge(language_codes, how="left", on="code")
successful_original_lang = successful_original_lang.groupby("genres").country.value_counts().to_frame()
successful_original_lang.columns = ["frequency"]

percent_successful_original_lang = (successful_original_lang['frequency'] / successful_original_lang.groupby('genres')['frequency'].transform('sum') * 100).round(decimals=2).to_frame()

 barchart_genre_subplot("Frequency of original language by genre in all movies", "Original Languages", "Frequency of language", all_original_lang, "Original Language", True)

 barchart_genre_subplot("Frequency of original language by genre in successful movies", "Original Languages", "Frequency of language", successful_original_lang, "Original Language", False)

 barchart_genre_subplot("Percentage frequency of original language by genre in all movies", "Original Languages", "Percentage frequency of language", percent_all_original_lang, "Original Language", True)

 barchart_genre_subplot("Percentage frequency of original language by genre in successful movies", "Original Languages", "Percentage frequency of language", percent_successful_original_lang, "Original Language", False)

 frequent_10_original_lang = all_mdb.loc[all_mdb.isin(all_mdb.original_language.value_counts().head(10).index.values).original_language]

# Change country codes to language name
language_codes.columns = ["original_language", "country"]
frequent_10_original_lang = pd.merge(frequent_10_original_lang, language_codes, how = "left", on="original_language")

# sort boxplots by descending median
median_order = frequent_10_original_lang.groupby("country")["revenue"].median().sort_values(ascending=False).index
sns.boxplot(x=frequent_10_original_lang.country, y=np.log10(frequent_10_original_lang.revenue).astype(float), order = median_order)
plt.xticks(rotation=90)
plt.xlabel("Language")
plt.show()

 #all_mdb.spoken_languages.dropna().reset_index(drop=True).to_frame().explode("spoken_languages")

top_10_original_lang = all_mdb.loc[all_mdb.isin(all_mdb.groupby("original_language")["revenue"].median().sort_values(ascending=False).head(10).index.values).original_language]

# Change country codes to language name
language_codes.columns = ["original_language", "country"]
top_10_original_lang = pd.merge(top_10_original_lang, language_codes, how = "left", on="original_language")

# sort boxplots by descending median
median_order = top_10_original_lang.groupby("country")["revenue"].median().sort_values(ascending=False).head(10).index
sns.boxplot(x=top_10_original_lang.country, y=np.log10(top_10_original_lang.revenue).astype(float), order = median_order)
plt.xticks(rotation=90)
plt.xlabel("Language")
plt.show()

 top_10_original_lang = all_mdb.loc[all_mdb.isin(all_mdb.groupby("original_language")["revenue"].median().sort_values(ascending=False).head(13).index.values).original_language]

# Change country codes to language name
language_codes.columns = ["original_language", "country"]
top_10_original_lang = pd.merge(top_10_original_lang, language_codes, how = "left", on="original_language")

# sort boxplots by descending median, remove the outliers
median_order = top_10_original_lang.groupby("country")["revenue"].median().sort_values(ascending=False).head(13).index.tolist()
median_order.remove("Abkhazian")
median_order.remove("Afrikaans")
median_order.remove("Malay")

sns.boxplot(x=top_10_original_lang.country, y=np.log10(top_10_original_lang.revenue).astype(float), order = median_order)
plt.xticks(rotation=90)
plt.xlabel("Original Language")
plt.show()

 spoken_languages = all_mdb.revenue.to_frame()
spoken_languages["spoken_languages"] = all_mdb.spoken_languages
spoken_languages = spoken_languages.dropna().explode("spoken_languages").reset_index(drop=True)

# Change country codes to language name
language_codes.columns = ["spoken_languages", "country"]
spoken_languages = pd.merge(spoken_languages, language_codes, how = "left", on="spoken_languages")

spoken_languages

 # sort boxplots by descending median
median_order = spoken_languages.groupby("country")["revenue"].median().sort_values(ascending=False).head(10).index.tolist()

sns.stripplot(x=spoken_languages.country, y=np.log10(spoken_languages.revenue).astype(float), order = median_order)
plt.xticks(rotation=90)
plt.xlabel("Spoken Language")
plt.show()

 # sort boxplots by descending median
filtered_spoken_languages = spoken_languages.loc[spoken_languages.isin(spoken_languages.groupby("country")["revenue"].count().loc[spoken_languages.groupby("country")["revenue"].count() >= 100].index).country]
median_order = filtered_spoken_languages.groupby("country")["revenue"].median().sort_values(ascending=False).index.tolist()

sns.boxplot(x=filtered_spoken_languages.country, y=np.log10(filtered_spoken_languages.revenue).astype(float), order = median_order)
plt.xticks(rotation=90)
plt.xlabel("Spoken Language")
plt.show()

 keywords = all_mdb.copy()
keywords.keywords = keywords.keywords.dropna()
keywords = keywords.explode("keywords").reset_index(drop=True)
keywords.keywords.value_counts()[keywords.keywords.value_counts() > 500].to_frame()

# There are 21074 total keywords, but most of them (20251) are under 100 uses.

 # Percentage Frequency of keywords in all movies
(keywords.keywords.value_counts()[keywords.keywords.value_counts() > 500] / keywords.keywords.count() * 100).round(decimals=2).to_frame()

 successful_keywords = successful_movies.copy()
successful_keywords.keywords = successful_keywords.keywords.dropna()
successful_keywords = successful_keywords.explode("keywords").reset_index(drop=True)

successful_keywords.keywords.value_counts().loc[successful_keywords.keywords.value_counts() >= 100].to_frame()
#There are 8559 total keywords, most of them (7948) are under 10 uses.

 # Percentage Frequency of keywords in successful movies
(successful_keywords.keywords.value_counts().loc[successful_keywords.keywords.value_counts() >= 100] / successful_keywords.keywords.count() * 100).round(decimals=2).to_frame()

 #Keywords with more than 50 uses in successful movies, ranked by mean revenue
good_keywords = successful_keywords.keywords.value_counts().loc[successful_keywords.keywords.value_counts() >= 50].index.tolist()
successful_keywords_revenue = successful_movies.loc[:,["revenue","keywords"]].explode("keywords").dropna().groupby("keywords").revenue.mean().sort_values(ascending=False).to_frame().reset_index()
successful_keywords_revenue.loc[successful_keywords_revenue .isin(good_keywords).keywords].head(10).reset_index(drop=True)

 genres_keywords = keywords.explode("genres").groupby("genres").keywords.value_counts().to_frame()
genres_keywords.columns = ["frequency"]
genres_keywords.head()

 successful_genres_keywords = successful_keywords.explode("genres").groupby("genres").keywords.value_counts().to_frame()
successful_genres_keywords.columns = ["frequency"]
successful_genres_keywords.head()

 def wordcloud_subplot(df, title):
    nrow = 4
    ncol = 6
    
    genres = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror',
       'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Sport', 'Thriller', 'War', 'Western', 'Reality-TV', 'Adult']
    
    fig = plt.figure(figsize=(16,10))
    gs = fig.add_gridspec(nrow, ncol, hspace=0.2, wspace=0)
    axes = gs.subplots(sharex=True, sharey=True)
    
    count = 0

    for r in range(nrow):
        for c in range(ncol):
            
            genre = genres[count]
            wc = WordCloud(background_color="white")
            wc.generate_from_frequencies(df.loc[genre].sort_values(by="frequency",ascending=False).head(10).to_dict()["frequency"])
 
        #fig = plt.figure()
#fig.set_figwidth(14) # set width
#fig.set_figheight(18) # set height
 
            axes[r,c].imshow(wc, interpolation="bilinear")
            axes[r,c].axis("off")
                
            axes[r,c].set_title(genre)
            
            count+=1
            if (count == len(df.index.get_level_values(0).unique())):
                axes[r,c].axis("off")
                break
                
    for ax in axes.flat:
                ax.label_outer()
    
    plt.suptitle(title)
    
    plt.show()

 wordcloud_subplot(genres_keywords, "Word Clouds of popular keywords by genre in all movies")

 wordcloud_subplot(successful_genres_keywords, "Word Clouds of popular keywords by genre in successful movies")

 actor = imdb_cast_crew.loc[imdb_cast_crew.category == "actor"].loc[:,["tconst","nconst"]].reset_index(drop=True)
actor.columns = ["imdb_id", "nconst"]
actress = imdb_cast_crew.loc[imdb_cast_crew.category == "actress"].loc[:,["tconst","nconst"]].reset_index(drop=True)
actress.columns = ["imdb_id", "nconst"]

# Merge with All_MDB dataset to get budget, revenue, startYear for each entry
actor = pd.merge(actor,all_mdb, on="imdb_id").loc[:,["imdb_id","nconst","budget","revenue", "startYear"]]
actress = pd.merge(actress,all_mdb, on="imdb_id").loc[:,["imdb_id","nconst","budget","revenue", "startYear"]]

# drop entries with 0 budget - they are probably incorrect
actor.budget = actor.budget.replace(0,np.nan).dropna()
actor = actor.dropna().reset_index(drop=True)
actress.budget = actress.budget.replace(0,np.nan).dropna()
actress = actress.dropna().reset_index(drop=True)

# find the Net Profit: revenue - budget
actor["Net Profit"] = actor["revenue"] - actor["budget"]
actress["Net Profit"] = actress["revenue"] - actress["budget"]

# Merge with IMDB_names dataset to get the name of each actor, and their deathYear
actor = pd.merge(actor,imdb_names, on="nconst").loc[:,["imdb_id","nconst", "primaryName", "budget","revenue","Net Profit", "startYear", "deathYear"]]
actress = pd.merge(actress,imdb_names, on="nconst").loc[:,["imdb_id","nconst", "primaryName", "budget","revenue","Net Profit", "startYear", "deathYear"]]

# Drop entries where the movie is before 2000, and drop actors who are dead (with a non-NaN death year)
# This is as we want to see who we should hire, so the actors should ideally be alive and the movies they starred in should be recent
# Drop death year because it is all NaN values
actor = actor.loc[(actor.startYear >= 2000) & (actor.deathYear.isna())].loc[:,:"startYear"].reset_index(drop=True)
actress = actress.loc[(actress.startYear >= 2000) & (actress.deathYear.isna())].loc[:,:"startYear"].reset_index(drop=True)

# Only include actors who starred in more than 10 movies, so as to remove outliers which may skew the results
actor_count = actor.groupby(['primaryName'],  as_index=False)["Net Profit"].count()
actor = actor.loc[actor.isin(actor_count.loc[actor_count["Net Profit"] > 10].primaryName.values).primaryName].reset_index(drop=True)
actress_count = actress.groupby(['primaryName'],  as_index=False)["Net Profit"].count()
actress = actress.loc[actress.isin(actress_count.loc[actress_count["Net Profit"] > 10].primaryName.values).primaryName].reset_index(drop=True)

# Rank actors by VAR
actor_ranked = actor.groupby(['primaryName'],  as_index=False)["Net Profit"].mean().sort_values(by='Net Profit', ascending=False).reset_index(drop=True)
actor_ranked['VAR'] = (actor_ranked['Net Profit']/actor_ranked['Net Profit'].mean())
actress_ranked = actress.groupby(['primaryName'],  as_index=False)["Net Profit"].mean().sort_values(by='Net Profit', ascending=False).reset_index(drop=True)
actress_ranked['VAR'] = (actress_ranked['Net Profit']/actress_ranked['Net Profit'].mean())
# calculate net profit
# we only care about actors and actresses i guess

 fig = plt.figure(figsize=(20,10))

ax0 = fig.add_subplot(1, 2, 1)
ax0 = sns.barplot(x=(actor_ranked.VAR.head(20).astype(float)).round(decimals=2), y=actor_ranked.primaryName.head(20),orient="h")
for i in ax0.containers:
    ax0.bar_label(i,)
ax0.set_ylabel("")
ax0.set_xlabel("")
ax0.set_title("Actors")
    
ax1 = fig.add_subplot(1, 2, 2)
ax1 = sns.barplot(x=(actress_ranked.VAR.head(20).astype(float)).round(decimals=2), y=actress_ranked.primaryName.head(20),orient="h")
for i in ax1.containers:
    ax1.bar_label(i,)
ax1.set_ylabel("")
ax0.set_xlabel("")
ax1.set_title("Actresses")

plt.suptitle("VAR of top 20 actors by gender")
fig.supylabel("Name of actor")
fig.supxlabel("VAR")
plt.show()

 def VAR(job, title): # job = writers or directors
    profession = job
    
    job = all_mdb.loc[:,["imdb_id", job, "budget", "revenue", "startYear"]].explode(job)
    job.columns = ["imdb_id", "nconst", "budget", "revenue", "startYear"]
    job.budget = job.budget.replace(0,np.nan)
    job = job.dropna(subset=["budget"])
    job["Net Profit"] = job["revenue"] - job["budget"]
    
    job = pd.merge(job, imdb_names, on="nconst").loc[:,["imdb_id","nconst", "primaryName", "budget","revenue","Net Profit", "startYear", "deathYear"]]
    job = job.loc[(job.startYear >= 2000) & (job.deathYear.isna())].loc[:,:"startYear"].reset_index(drop=True)
    
    job_count = job.groupby(['primaryName'],  as_index=False)["Net Profit"].count()
    job = job.loc[job.isin(job_count.loc[job_count["Net Profit"] > 10].primaryName.values).primaryName].reset_index(drop=True)

    job_ranked = job.groupby(['primaryName'],  as_index=False)["Net Profit"].mean().sort_values(by='Net Profit', ascending=False).reset_index(drop=True)
    job_ranked['VAR'] = (job_ranked['Net Profit']/job_ranked['Net Profit'].mean())
    
    # graph
    plt.figure(figsize=(20,10))

    ax = sns.barplot(x=(job_ranked.VAR.head(20).astype(float)).round(decimals=2), y=job_ranked.primaryName.head(20),orient="h")
    for i in ax.containers:
        ax.bar_label(i,)

    plt.ylabel("Name of " + profession)
    plt.xlabel("VAR")
    plt.title(title)
    
    return job_ranked

 directors_ranked = VAR("directors", "VAR of top 20 directors")

 writers_ranked = VAR("writers", "VAR of top 20 writers")

 actor_ranked["Job"] = "Actors"
actress_ranked["Job"] = "Actresses"
directors_ranked["Job"] = "Directors"
writers_ranked["Job"] = "Writers"

job_ranked = pd.concat([actor_ranked, actress_ranked, directors_ranked, writers_ranked])

sns.boxplot(x = job_ranked.Job, y = job_ranked.VAR)

<AxesSubplot:xlabel='Job', ylabel='VAR'>

 sns.violinplot(x = job_ranked.Job, y = job_ranked.VAR)

<AxesSubplot:xlabel='Job', ylabel='VAR'>

 production_companies = all_mdb.loc[:,["production_companies", "revenue", "budget", "popularity", "averageRating"]].explode("production_companies").dropna().explode("production_companies").reset_index(drop=True).explode("production_companies")
companies = production_companies.loc[0::2]
countries = production_companies.loc[1::2]
countries.loc[:,"production_companies"] = countries.production_companies.replace("", np.nan)

production_companies = companies.copy()
production_companies.insert(1, "production_country", countries.production_companies.values)

# We only want to get the top 25 companies in terms of number of movies released, and drop the rest
companies_list = production_companies.groupby("production_companies", as_index=False).revenue.count().sort_values(by="revenue", ascending=False).reset_index(drop=True).head(25).production_companies.tolist()
production_companies = production_companies.loc[production_companies.isin(companies_list).production_companies].reset_index(drop=True)

companies_scatter = production_companies.groupby("production_companies", as_index=False).agg({'revenue':'mean','budget':'mean','popularity':'mean','averageRating':'mean'})

C:\Users\kaiwen\AppData\Local\Temp\ipykernel_23184\1100111065.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  countries.loc[:,"production_companies"] = countries.production_companies.replace("", np.nan)

 #plt.figure(figsize=(15,15))
fig = plt.figure(figsize=(15,15))
# plotting scatter plot
sc = sns.regplot(x=companies_scatter.revenue,y=companies_scatter.popularity)

# Loop for annotation of all points
for i in range(len(companies_scatter.revenue)):
    plt.annotate(companies_scatter.production_companies[i], (companies_scatter.revenue[i] + 0.4, companies_scatter.popularity[i] + 0.4))

pearson_coef, p_value = stats.pearsonr(companies_scatter.revenue, companies_scatter.popularity)
print("Pearson's Coefficient is " + str(pearson_coef) + " P-value is " + str(p_value))
plt.show()

Pearson's Coefficient is 0.8327430006416182 P-value is 2.415699718774758e-07

 sns.residplot(x=companies_scatter.revenue,y=companies_scatter.popularity)
plt.show()

 # Dataframe of all movies' revenue and year released
revenue_year = pd.DataFrame([all_mdb.release_date, all_mdb.revenue]).T
revenue_year.loc[:,"revenue"] = revenue_year.loc[:,"revenue"].replace(0,np.nan)
revenue_year = revenue_year.dropna().reset_index(drop=True)
revenue_year.loc[:,"release_date"] = revenue_year.loc[:,"release_date"].str.slice(0,4).astype(int)
revenue_year

# Dataframe of successful movies' revenue and year released
successful_revenue_year = pd.DataFrame([successful_movies.release_date, successful_movies.revenue]).T
successful_revenue_year.loc[:,"revenue"] = successful_revenue_year.loc[:,"revenue"].replace(0,np.nan)
successful_revenue_year = successful_revenue_year.dropna().reset_index(drop=True)
successful_revenue_year.loc[:,"release_date"] = successful_revenue_year.loc[:,"release_date"].str.slice(0,4).astype(int)
successful_revenue_year


fig = plt.figure(figsize=(20,10))

ax0 = fig.add_subplot(1, 2, 1)
ax0 = sns.lineplot(x = revenue_year.release_date, y = np.log10(revenue_year.revenue.astype(float)))
ax0.set_title("Revenue of all movies by year")
ax0.set_xlabel("Year")

ax1 = fig.add_subplot(1, 2, 2) 
ax1 = sns.lineplot(x = successful_revenue_year.release_date, y = np.log10(successful_revenue_year.revenue.astype('float')))
ax1.set_title("Revenue of successful movies by year")
ax1.set_xlabel("Year")

plt.show()

 all_month = all_mdb.release_date.dropna().str.slice(5,7).value_counts().sort_index().to_frame()
all_month.columns = ["Movies Released"]
all_month.index = ["Jan","Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
all_month.head()

 successful_month = successful_movies.release_date.dropna().str.slice(5,7).value_counts().sort_index().to_frame()
successful_month.columns = ["Movies Released"]
successful_month.index = ["Jan","Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
successful_month.head()

 fig = plt.figure(figsize=(20,5))

ax0 = fig.add_subplot(1, 2, 1)
ax0 = sns.barplot(x = all_month.index.values.flatten(), y = all_month.values.flatten())
for i in ax0.containers:
    ax0.bar_label(i,)
ax0.set_title("(All movies) Number of movies released by month")
ax0.set_xlabel("Month")
ax0.set_ylabel("Number of movies released")

ax1 = fig.add_subplot(1, 2, 2)
ax1 = sns.barplot(x = successful_month.index.values.flatten(), y = successful_month.values.flatten())
for i in ax1.containers:
    ax1.bar_label(i,)
ax1.set_title("(Successful movies) Number of movies released by month")
ax1.set_xlabel("Month")
ax1.set_ylabel("Number of movies released")

plt.show()

 all_day = pd.to_datetime(all_mdb.release_date.dropna()).apply(lambda t: t.weekday()).value_counts().sort_index().to_frame()
all_day.columns = ["Movies Released"]
all_day.index = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
all_day

 successful_day = pd.to_datetime(successful_movies.release_date.dropna()).apply(lambda t: t.weekday()).value_counts().sort_index().to_frame()
successful_day.columns = ["Movies Released"]
successful_day.index = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
successful_day

 fig = plt.figure(figsize=(20,10))

ax0 = fig.add_subplot(2, 2, 1)
ax0 = sns.barplot(x = all_day.index.values.flatten(), y = all_day.values.flatten())
for i in ax0.containers:
    ax0.bar_label(i,)
ax0.set_title("(All movies) Number of movies released by weekday")
ax0.set_xlabel("Weekday")
ax0.set_ylabel("Number of movies released")

ax1 = fig.add_subplot(2, 2, 2)
ax1 = sns.barplot(x = successful_day.index.values.flatten(), y = successful_day.values.flatten())
for i in ax1.containers:
    ax1.bar_label(i,)
ax1.set_title("(Successful movies) Number of movies released by weekday")
ax1.set_xlabel("Weekday")
ax1.set_ylabel("Number of movies released")

ax3 = fig.add_subplot(2, 2, 3)
ax3 = sns.barplot(x = all_day.index.values.flatten(), y = (all_day.values.flatten()/all_day.values.sum() * 100).round(decimals=2))
for i in ax3.containers:
    ax3.bar_label(i,)
ax3.set_title("(All movies) Percentage of movies released by weekday")
ax3.set_xlabel("Weekday")
ax3.set_ylabel("Percentage of movies released")

ax4 = fig.add_subplot(2, 2, 4)
ax4 = sns.barplot(x = successful_day.index.values.flatten(), y = (successful_day.values.flatten()/successful_day.values.sum() * 100).round(decimals=2))
for i in ax4.containers:
    ax4.bar_label(i,)
ax4.set_title("(Successful movies) Percentage of movies released by weekday")
ax4.set_xlabel("Weekday")
ax4.set_ylabel("Percentage of movies released")

fig.tight_layout()

plt.show()

 all_date = pd.to_datetime(all_mdb.release_date.dropna()).apply(lambda t: t.day).value_counts().sort_index().to_frame()
all_date.columns = ["Movies Released"]

successful_date = pd.to_datetime(successful_movies.release_date.dropna()).apply(lambda t: t.day).value_counts().sort_index().to_frame()
successful_date.columns = ["Movies Released"]

 fig = plt.figure(figsize=(20,10))

ax0 = fig.add_subplot(2, 2, 1)
ax0 = sns.barplot(x = all_date.index.values.flatten(), y = all_date.values.flatten())
for i in ax0.containers:
    ax0.bar_label(i,)
ax0.set_title("(All movies) Number of movies released by date of month")
ax0.set_xlabel("Date of month")
ax0.set_ylabel("Number of movies released")

ax1 = fig.add_subplot(2, 2, 2)
ax1 = sns.barplot(x = successful_date.index.values.flatten(), y = successful_date.values.flatten())
for i in ax1.containers:
    ax1.bar_label(i,)
ax1.set_title("(Successful movies) Number of movies released by date of month")
ax1.set_xlabel("Date of month")
ax1.set_ylabel("Number of movies released")

ax3 = fig.add_subplot(2, 2, 3)
ax3 = sns.barplot(x = all_date.index.values.flatten(), y = (all_date.values.flatten()/all_date.values.sum() * 100).round(decimals=2))
for i in ax3.containers:
    ax3.bar_label(i,)
ax3.set_title("(All movies) Percentage of movies released by date of month")
ax3.set_xlabel("Date of month")
ax3.set_ylabel("Percentage of movies released")

ax4 = fig.add_subplot(2, 2, 4)
ax4 = sns.barplot(x = successful_date.index.values.flatten(), y = (successful_date.values.flatten()/successful_date.values.sum() * 100).round(decimals=2))
for i in ax4.containers:
    ax4.bar_label(i,)
ax4.set_title("(Successful movies) Percentage of movies released by date of month")
ax4.set_xlabel("Date of month")
ax4.set_ylabel("Percentage of movies released")

fig.tight_layout()

plt.show()

 def popularity(factor, metric): # where factor is "Year", "Revenue". metric is "IMDB rating", "TMDB popularity", 

    nrow = 4
    ncol = 6
    
    fig = plt.figure(figsize=(16,10))
    gs = fig.add_gridspec(nrow, ncol, hspace=0.2, wspace=0)
    axes = gs.subplots(sharex=True, sharey=True)
    
    count = 0

    for r in range(nrow):
        for c in range(ncol):
            
            if (factor == "Year"): 
                a = popularity_revenue.loc[popularity_revenue.genres == popularity_revenue.genres.unique()[count]].dropna(subset=["release_date"])
            
                if (metric == "IMDB rating"):
                    sns.scatterplot(x=a.release_date.astype(int),y=a.averageRating,ax=axes[r,c], alpha = 0.3)
                    axes[r,c].set_ylim(0, 10)
                elif (metric == "TMDB popularity"):
                    sns.scatterplot(x=a.release_date.astype(int),y=a.popularity,ax=axes[r,c], alpha = 0.3)
                    axes[r,c].set_ylim(0, 500) # limit TMDB popularity to 500 because the outliers skew data
                    
            elif (factor == "Revenue"):
                popularity_revenue.loc[:,"revenue"] = popularity_revenue.revenue.replace(0,np.nan)
                a = popularity_revenue.loc[popularity_revenue.genres == popularity_revenue.genres.unique()[count]].dropna(subset=["revenue"])
                a.revenue = np.log10((a.revenue).astype(float))
                a.popularity = a.popularity.astype(float)
                a.averageRating = a.averageRating.astype(float)
                
                if (metric == "IMDB rating"):
                    sns.regplot(x=a.revenue.astype(float),y=a.averageRating,ax=axes[r,c], scatter_kws={'alpha':0.3}, line_kws={'color': 'red'})
                    axes[r,c].set_ylim(0, 10)
                
                elif (metric == "TMDB popularity"): 
                    sns.regplot(x=a.revenue.astype(float),y=a.popularity,ax=axes[r,c], scatter_kws={'alpha':0.3}, line_kws={'color': 'red'})
                    axes[r,c].set_ylim(0, 500)
                
            axes[r,c].set_title(popularity_revenue.genres.unique()[count])
            axes[r,c].set_ylabel("") 
            axes[r,c].set_xlabel("")
            
            count+=1
            if (count == 23):
                break
                
    for ax in axes.flat:
                ax.label_outer()
    

    if (factor == "Year"):
        fig.supxlabel("Release Year")
        fig.supylabel(metric)
        plt.suptitle(metric +" of genres over time")
    elif (factor == "Revenue"):
        fig.supxlabel("Movie Revenue")
        fig.supylabel(metric)
        plt.suptitle(metric + " of genres against revenue")
    plt.show()

 popularity_revenue = pd.DataFrame([all_mdb.popularity, all_mdb.averageRating, all_mdb.numVotes, all_mdb.genres, all_mdb.revenue, all_mdb.release_date]).T.explode("genres").reset_index(drop=True)
popularity_revenue.loc[:,"release_date"] = popularity_revenue.release_date.str.slice(0,4)

popularity("Year", "IMDB rating")

 popularity("Year", "TMDB popularity")

 popularity("Revenue", "IMDB rating")

 popularity("Revenue", "TMDB popularity")

 reg_residual_subplot("Revenue", "Popularity", np.log10(popularity_revenue.revenue.astype(float)), popularity_revenue.popularity.astype(float), (), (0,500))

Pearson's Coefficient is 0.17577145298091046 P-value is 1.756177612781108e-191

 def reg_residual_subplot(x_name, y_name, x_df,y_df, x_lim, y_lim, title):
    # transparent = True or False
    fig = plt.figure(figsize=(20,4))
    ax0 = fig.add_subplot(1, 2, 1)
    
    if (len(x_lim) == 0):
        x_lim = (None, None)
    if (len(y_lim) == 0):
        y_lim = (None, None)

        
    sns.regplot(x=x_df.astype(float), y=y_df.astype(float), scatter_kws={'alpha':0.3}, line_kws={'color': 'red'}, ax=ax0)
    ax0.set_xlim(x_lim)
    ax0.set_ylim(y_lim)
    ax0.set_xlabel(x_name)
    ax0.set_ylabel(y_name)
    ax0.set_title("Linear Regression Plot of " + y_name + " against " + x_name)
    
    ax1 = fig.add_subplot(1, 2, 2)
    
    sns.residplot(x=x_df.astype(float), y=y_df.astype(float), scatter_kws={'alpha':0.3}, ax=ax1)
    ax1.set_xlim(x_lim)
    ax1.set_ylim(y_lim)
    ax1.set_xlabel(x_name)
    ax1.set_ylabel(y_name)
    ax1.set_title("Residual Plot of " + y_name + " against " + x_name)
    
    coef = pd.DataFrame(x_df.astype(float))
    coef.columns = ["x"]
    coef["y"] = y_df.astype(float)
    coef = coef.dropna()
    
    pearson_coef, p_value = stats.pearsonr(coef["x"], coef["y"])
    print("Pearson's Coefficient is " + str(pearson_coef) + " P-value is " + str(p_value))
    
    fig.suptitle(title)
    
    plt.show()

 def barchart_genre_subplot(title,x_name,y_name, df, factor,all_movies): 
    # Factor: "Original Language"
    # all_movies: True, is using all movies dataset, False, is using successful movies dataset
    
    genres = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror',
       'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Sport', 'Thriller', 'War', 'Western', 'Reality-TV', 'Adult']
    
    languages = ["English","Hindi","French","Japanese","Spanish","Russian","German","Italian","Danish","Korean", "Chinese"]
    colours = ["tab:blue","tab:orange","tab:green","tab:red","tab:purple","tab:brown","tab:pink","tab:gray","tab:olive","tab:cyan","black"]

    colours = pd.DataFrame(list(zip(languages, colours)), columns =["country","colours"])
     
    nrow = 4
    ncol = 6

    fig = plt.figure(figsize=(20,10))
    gs = fig.add_gridspec(nrow, ncol, hspace=0.2, wspace=0)
    axes = gs.subplots(sharex=True, sharey=True)
    
    count = 0

    for r in range(nrow):
        for c in range(ncol):
            
            if (factor == "Original Language"):
                a = df.loc[(genres[count])].dropna()
                if (title == "Percentage frequency of original language by genre in successful movies"):
                    a = a.loc[a.frequency > 5]
                elif (all_movies == False):
                    a = a.loc[a.frequency > 20]
                else:
                    a = a.head(3)
                if a.frequency.sum() == 0:
                    axes[r,c].set_title(genres[count])
                    axes[r,c].set_ylabel("") 
                    axes[r,c].set_xlabel("")            
                    count+=1
                    continue
                bar_colour = pd.merge(a, colours, on="country", how="left")
                a.T.plot(kind='bar',ax = axes[r,c],color = bar_colour.colours.tolist())

            axes[r,c].set_title(genres[count])
            axes[r,c].set_ylabel("") 
            axes[r,c].set_xlabel("")
            axes[r,c].get_legend().remove()
            
            for bars in axes[r,c].containers:
                axes[r,c].bar_label(bars)
            
            count+=1
            if (count == len(df.index.get_level_values(0).unique())):
                break
                
    for ax in axes.flat:
        ax.label_outer()

    lines = []
    labels = []
  
    for ax in fig.axes:
        Line, Label = ax.get_legend_handles_labels()
        lines.extend(Line)
        labels.extend(Label)
  
    labels_df = pd.DataFrame(list(zip(lines,labels)), columns =["lines","labels"])
    labels_df = labels_df.drop_duplicates(subset = "labels") 
    fig.legend(labels_df.lines, labels_df.labels, loc='upper right')
    
    fig.supxlabel(x_name)
    fig.supylabel(y_name)
    plt.suptitle(title)
        
    plt.show()

 def wordcloud_subplot(df, title):
    nrow = 4
    ncol = 6
    
    genres = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror',
       'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Sport', 'Thriller', 'War', 'Western', 'Reality-TV', 'Adult']
    
    fig = plt.figure(figsize=(16,10))
    gs = fig.add_gridspec(nrow, ncol, hspace=0.2, wspace=0)
    axes = gs.subplots(sharex=True, sharey=True)
    
    count = 0

    for r in range(nrow):
        for c in range(ncol):
            
            genre = genres[count]
            wc = WordCloud(background_color="white")
            wc.generate_from_frequencies(df.loc[genre].sort_values(by="frequency",ascending=False).head(10).to_dict()["frequency"])
 
            axes[r,c].imshow(wc, interpolation="bilinear")
            axes[r,c].axis("off")
                
            axes[r,c].set_title(genre)
            
            count+=1
            if (count == len(df.index.get_level_values(0).unique())):
                axes[r,c].axis("off")
                break
                
    for ax in axes.flat:
                ax.label_outer()
    
    plt.suptitle(title)
    
    plt.show()

 def VAR(job, title): # job = writers or directors
    profession = job
    
    job = all_mdb.loc[:,["imdb_id", job, "budget", "revenue", "startYear"]].explode(job)
    job.columns = ["imdb_id", "nconst", "budget", "revenue", "startYear"]
    job.budget = job.budget.replace(0,np.nan)
    job = job.dropna(subset=["budget"])
    job["Net Profit"] = job["revenue"] - job["budget"]
    
    job = pd.merge(job, imdb_names, on="nconst").loc[:,["imdb_id","nconst", "primaryName", "budget","revenue","Net Profit", "startYear", "deathYear"]]
    job = job.loc[(job.startYear >= 2000) & (job.deathYear.isna())].loc[:,:"startYear"].reset_index(drop=True)
    
    job_count = job.groupby(['primaryName'],  as_index=False)["Net Profit"].count()
    job = job.loc[job.isin(job_count.loc[job_count["Net Profit"] > 10].primaryName.values).primaryName].reset_index(drop=True)

    job_ranked = job.groupby(['primaryName'],  as_index=False)["Net Profit"].mean().sort_values(by='Net Profit', ascending=False).reset_index(drop=True)
    job_ranked['VAR'] = (job_ranked['Net Profit']/job_ranked['Net Profit'].mean())
    
    # graph
    plt.figure(figsize=(20,10))

    ax = sns.barplot(x=(job_ranked.VAR.head(20).astype(float)).round(decimals=2), y=job_ranked.primaryName.head(20),orient="h")
    for i in ax.containers:
        ax.bar_label(i,)

    plt.ylabel("Name of " + profession)
    plt.xlabel("VAR")
    plt.title(title)
    
    return job_ranked

 def popularity(factor, metric, title): # where factor is "Year", "Revenue". metric is "IMDB rating", "TMDB popularity", 

    nrow = 4
    ncol = 6
    
    fig = plt.figure(figsize=(16,10))
    gs = fig.add_gridspec(nrow, ncol, hspace=0.2, wspace=0)
    axes = gs.subplots(sharex=True, sharey=True)
    
    count = 0

    for r in range(nrow):
        for c in range(ncol):
            
            if (factor == "Year"): 
                a = popularity_revenue.loc[popularity_revenue.genres == popularity_revenue.genres.unique()[count]].dropna(subset=["release_date"])
            
                if (metric == "IMDB rating"):
                    sns.scatterplot(x=a.release_date.astype(int),y=a.averageRating,ax=axes[r,c], alpha = 0.3)
                    axes[r,c].set_ylim(0, 10)
                elif (metric == "TMDB popularity"):
                    sns.scatterplot(x=a.release_date.astype(int),y=a.popularity,ax=axes[r,c], alpha = 0.3)
                    axes[r,c].set_ylim(0, 500) # limit TMDB popularity to 500 because the outliers skew data
                    
            elif (factor == "Revenue"):
                popularity_revenue.loc[:,"revenue"] = popularity_revenue.revenue.replace(0,np.nan)
                a = popularity_revenue.loc[popularity_revenue.genres == popularity_revenue.genres.unique()[count]].dropna(subset=["revenue"])
                a.revenue = np.log10((a.revenue).astype(float))
                a.popularity = a.popularity.astype(float)
                a.averageRating = a.averageRating.astype(float)
                
                if (metric == "IMDB rating"):
                    sns.regplot(x=a.revenue.astype(float),y=a.averageRating,ax=axes[r,c], scatter_kws={'alpha':0.3}, line_kws={'color': 'red'})
                    axes[r,c].set_ylim(0, 10)
                
                elif (metric == "TMDB popularity"): 
                    sns.regplot(x=a.revenue.astype(float),y=a.popularity,ax=axes[r,c], scatter_kws={'alpha':0.3}, line_kws={'color': 'red'})
                    axes[r,c].set_ylim(0, 500)
                
            axes[r,c].set_title(popularity_revenue.genres.unique()[count])
            axes[r,c].set_ylabel("") 
            axes[r,c].set_xlabel("")
            
            count+=1
            if (count == 23):
                break
                
    for ax in axes.flat:
                ax.label_outer()
    

    if (factor == "Year"):
        fig.supxlabel("Release Year")
        fig.supylabel(metric)
        plt.suptitle("Popularity of genres over time")
    elif (factor == "Revenue"):
        fig.supxlabel("Movie Revenue")
        fig.supylabel("Popularity")
        plt.suptitle(title)
    plt.show()

 fig = plt.figure(figsize=(15,12))
ax0 = fig.add_subplot(1, 2, 1)

ax0 = sns.barplot(y=genres_percentage.index.values ,x=genres_percentage.values, orient='h')
for i in ax0.containers:
    ax0.bar_label(i,)
ax0.set_title("Percentage of frequency of genres in all movies")
ax0.set_xlabel("Percentage of frequency (%)")
ax0.set_ylabel("Genres")

ax1 = fig.add_subplot(1, 2, 2) 
ax1 = sns.barplot(y=successful_genres_percentage.index.values ,x=successful_genres_percentage.genres, orient='h')
for i in ax1.containers:
    ax1.bar_label(i,)
ax1.set_title("Percentage of successful movies for each genre")
ax1.set_xlabel("Percentage of successful movies (%)")
ax1.set_ylabel("Genres")

fig.suptitle("Genres with more movies do not correlate with success. \nAdventure, Family, Animation, Sci-Fi and Horror have the highest chance of being successful.")
plt.show()

 plt.figure(figsize=(20,7)) 
sns.pointplot( y=np.log10(genre_mean.values.astype('float')), x=genre_mean.index.values)
plt.title("Adventure, Animation, Sci-Fi, Action, Fantasy have the highest average revenue.")
plt.xlabel("Genres")
plt.ylabel("Revenue (by powers of 10)")
plt.show()

median_order = genres.groupby(by=["genres"])["revenue"].median().sort_values(ascending=False).index.tolist()
median_order.remove("Adult")
plt.figure(figsize=(20,7)) 
sns.boxplot( y=np.log10(genres["revenue"].astype('float')), x=genres["genres"], order = median_order)
plt.xlabel("Genres")
plt.ylabel("Revenue (by powers of 10)")
plt.title("Adventure, Animation, Family, Action, Sci-Fi have the highest median revenue.")
plt.show()

 reg_residual_subplot("Budget", "Revenue", all_mdb.budget, all_mdb.revenue, (), (), "There is a strong positive correlation between budget and revenue.")

Pearson's Coefficient is 0.7467009218280032 P-value is 0.0

 reg_residual_subplot("Runtime", "Revenue", all_mdb.runtimeMinutes, all_mdb.revenue, (0,300), (0,1.0e9),"There is a weak positive correlation between budget and revenue.")

Pearson's Coefficient is 0.17891001036238946 P-value is 1.292560240450447e-79

 barchart_genre_subplot("Percentage frequency of original language by genre in all movies", "Original Languages", "Percentage frequency of language", percent_all_original_lang, "Original Language", True)

 barchart_genre_subplot("Percentage frequency of original language by genre in successful movies", "Original Languages", "Percentage frequency of language", percent_successful_original_lang, "Original Language", False)

 top_10_original_lang = all_mdb.loc[all_mdb.isin(all_mdb.groupby("original_language")["revenue"].median().sort_values(ascending=False).head(13).index.values).original_language]

# Change country codes to language name
language_codes.columns = ["original_language", "country"]

top_10_original_lang = pd.merge(top_10_original_lang, language_codes, how = "left", on="original_language")

# sort boxplots by descending median, remove the outliers
median_order = top_10_original_lang.groupby("country")["revenue"].median().sort_values(ascending=False).head(13).index.tolist()
median_order.remove("Abkhazian")
median_order.remove("Afrikaans")
median_order.remove("Malay")

plt.figure(figsize=(15,7))
sns.boxplot(x=top_10_original_lang.country, y=np.log10(top_10_original_lang.revenue).astype(float), order = median_order)
plt.xticks(rotation=90)
plt.xlabel("Original Language")
plt.title("Chinese, followed by English, is the original language with the highest median.")

plt.show()

 # sort boxplots by descending median
filtered_spoken_languages = spoken_languages.loc[spoken_languages.isin(spoken_languages.groupby("country")["revenue"].count().loc[spoken_languages.groupby("country")["revenue"].count() >= 100].index).country]
median_order = filtered_spoken_languages.groupby("country")["revenue"].median().sort_values(ascending=False).index.tolist()

plt.figure(figsize=(15,7))
sns.boxplot(x=filtered_spoken_languages.country, y=np.log10(filtered_spoken_languages.revenue).astype(float), order = median_order)
plt.xticks(rotation=90)
plt.xlabel("Spoken Language")
plt.title("Chinese-speaking movies have the highest median revenue.")
plt.show()

 good_keywords = successful_keywords.keywords.value_counts().loc[successful_keywords.keywords.value_counts() >= 50].index.tolist()
successful_keywords_revenue = successful_movies.loc[:,["revenue","keywords"]].explode("keywords").dropna().groupby("keywords").revenue.mean().sort_values(ascending=False).to_frame().reset_index()
successful_keywords_revenue = successful_keywords_revenue.loc[successful_keywords_revenue .isin(good_keywords).keywords].head(10).reset_index(drop=True)
successful_keywords_revenue

 wordcloud_subplot(successful_genres_keywords, "Word Clouds of popular keywords by genre in successful movies")

 fig = plt.figure(figsize=(20,10))

ax0 = fig.add_subplot(1, 2, 1)
ax0 = sns.barplot(x=(actor_ranked.VAR.head(20).astype(float)).round(decimals=2), y=actor_ranked.primaryName.head(20),orient="h")
for i in ax0.containers:
    ax0.bar_label(i,)
ax0.set_ylabel("")
ax0.set_xlabel("")
ax0.set_title("Actors")
    
ax1 = fig.add_subplot(1, 2, 2)
ax1 = sns.barplot(x=(actress_ranked.VAR.head(20).astype(float)).round(decimals=2), y=actress_ranked.primaryName.head(20),orient="h")
for i in ax1.containers:
    ax1.bar_label(i,)
ax1.set_ylabel("")
ax0.set_xlabel("")
ax1.set_title("Actresses")

plt.suptitle("VAR of top 20 actors by gender")
fig.supylabel("Name of actor")
fig.supxlabel("VAR")
plt.show()

 directors_ranked = VAR("directors", "VAR of top 20 directors")

 writers_ranked = VAR("writers", "VAR of top 20 writers")

 fig = plt.figure(figsize=(15,7))

ax0 = fig.add_subplot(1, 2, 1)
ax0 = sns.boxplot(x = job_ranked.Job, y = job_ranked.VAR)

ax1 = fig.add_subplot(1, 2, 2)
ax1 = sns.violinplot(x = job_ranked.Job, y = job_ranked.VAR)

fig.suptitle("Generally, Directors bring the most to a movie's success, as they have the highest median VAR.")
plt.show()

 #plt.figure(figsize=(15,15))
fig = plt.figure(figsize=(15,15))
# plotting scatter plot
sc = sns.regplot(x=companies_scatter.revenue,y=companies_scatter.popularity)

# Loop for annotation of all points
for i in range(len(companies_scatter.revenue)):
    plt.annotate(companies_scatter.production_companies[i], (companies_scatter.revenue[i] + 0.4, companies_scatter.popularity[i] + 0.4))

pearson_coef, p_value = stats.pearsonr(companies_scatter.revenue, companies_scatter.popularity)
print("Pearson's Coefficient is " + str(pearson_coef) + " P-value is " + str(p_value))
plt.show()

Pearson's Coefficient is 0.8327430006416182 P-value is 2.415699718774758e-07

 fig = plt.figure(figsize=(20,5))

ax0 = fig.add_subplot(1, 2, 1)
ax0 = sns.barplot(x = all_month.index.values.flatten(), y = all_month.values.flatten())
for i in ax0.containers:
    ax0.bar_label(i,)
ax0.set_title("(All movies) Number of movies released by month")
ax0.set_xlabel("Month")
ax0.set_ylabel("Number of movies released")

ax1 = fig.add_subplot(1, 2, 2)
ax1 = sns.barplot(x = successful_month.index.values.flatten(), y = successful_month.values.flatten())
for i in ax1.containers:
    ax1.bar_label(i,)
ax1.set_title("(Successful movies) Number of movies released by month")
ax1.set_xlabel("Month")
ax1.set_ylabel("Number of movies released")

plt.show()

 fig = plt.figure(figsize=(15,7))

ax3 = fig.add_subplot(1, 2, 1)
ax3 = sns.barplot(x = all_day.index.values.flatten(), y = (all_day.values.flatten()/all_day.values.sum() * 100).round(decimals=2))
for i in ax3.containers:
    ax3.bar_label(i,)
ax3.set_title("(All movies) Percentage of movies released by weekday")
ax3.set_xlabel("Weekday")
ax3.set_ylabel("Percentage of movies released")

ax4 = fig.add_subplot(1,2,2)
ax4 = sns.barplot(x = successful_day.index.values.flatten(), y = (successful_day.values.flatten()/successful_day.values.sum() * 100).round(decimals=2))
for i in ax4.containers:
    ax4.bar_label(i,)
ax4.set_title("(Successful movies) Percentage of movies released by weekday")
ax4.set_xlabel("Weekday")
ax4.set_ylabel("Percentage of movies released")

fig.suptitle("Friday is clearly the best day to release a movie.")
plt.show()

 fig = plt.figure(figsize=(20,5))

ax3 = fig.add_subplot(1, 2, 1)
ax3 = sns.barplot(x = all_date.index.values.flatten(), y = (all_date.values.flatten()/all_date.values.sum() * 100).round(decimals=2))
for i in ax3.containers:
    ax3.bar_label(i,)
ax3.set_title("(All movies) Percentage of movies released by date of month")
ax3.set_xlabel("Date of month")
ax3.set_ylabel("Percentage of movies released")

ax4 = fig.add_subplot(1,2,2)
ax4 = sns.barplot(x = successful_date.index.values.flatten(), y = (successful_date.values.flatten()/successful_date.values.sum() * 100).round(decimals=2))
for i in ax4.containers:
    ax4.bar_label(i,)
ax4.set_title("(Successful movies) Percentage of movies released by date of month")
ax4.set_xlabel("Date of month")
ax4.set_ylabel("Percentage of movies released")

plt.suptitle("Release your movie on the 1st or 25th of the month.")

plt.show()

 popularity("Revenue", "TMDB popularity", "There is a positive correlation between revenue and popularity.")

 reg_residual_subplot("Revenue", "Popularity", np.log10(popularity_revenue.revenue.astype(float)), popularity_revenue.popularity.astype(float), (), (0,500),"")

Pearson's Coefficient is 0.17577145298091046 P-value is 1.756177612781108e-191

	tconst	directors	writers
0	tt0000001	nm0005690	NaN
1	tt0000002	nm0721526	NaN
2	tt0000003	nm0721526	NaN
3	tt0000004	nm0721526	NaN
4	tt0000005	nm0005690	NaN

	tconst	averageRating	numVotes
0	tt0000001	5.7	1903
1	tt0000002	5.9	255
2	tt0000003	6.5	1698
3	tt0000004	5.7	167
4	tt0000005	6.2	2518

	nconst	primaryName	birthYear	deathYear	primaryProfession	knownForTitles
0	nm0000001	Fred Astaire	1899.0	1987.0	soundtrack,actor,miscellaneous	tt0031983,tt0050419,tt0053137,tt0072308
1	nm0000002	Lauren Bacall	1924.0	2014.0	actress,soundtrack	tt0038355,tt0071877,tt0117057,tt0037382
2	nm0000003	Brigitte Bardot	1934.0	NaN	actress,soundtrack,music_department	tt0054452,tt0049189,tt0057345,tt0056404
3	nm0000004	John Belushi	1949.0	1982.0	actor,soundtrack,writer	tt0077975,tt0072562,tt0078723,tt0080455
4	nm0000005	Ingmar Bergman	1918.0	2007.0	writer,director,actor	tt0050976,tt0083922,tt0060827,tt0050986

	tconst	ordering	nconst	category	job	characters
0	tt0000001	1	nm1588970	self	NaN	["Self"]
1	tt0000001	2	nm0005690	director	NaN	NaN
2	tt0000001	3	nm0374658	cinematographer	director of photography	NaN
3	tt0000002	1	nm0721526	director	NaN	NaN
4	tt0000002	2	nm1335271	composer	NaN	NaN

	0
aa	Afar
ab	Abkhazian
af	Afrikaans
am	Amharic
ar	Arabic
...	...
zh-hk	Chinese (Hong Kong SAR)
zh-mo	Chinese (Macau SAR)
zh-sg	Chinese (Singapore)
zh-tw	Chinese (Taiwan)
zu	Zulu

What makes a movie successful? by [Kai Wen]¶

Table of Content (with relevant hyperlinks to sections)¶

Motivation and Background ¶

Summary of Research Questions & Results ¶

Dataset ¶

Methodology ¶

Data Acquisition ¶

Data Cleaning ¶

EDA ¶

Research Question 1: What are the best genres for a movie?¶

Research Question 2: What other factors affect a movie's revenue?¶

Research Question 3: Who should you hire to produce a movie?¶

Research Question 4: When should one release a movie?¶

Research Question 5: Does popularity correlate with revenue?¶

Results Findings & Conclusion ¶

Research Question 1: What are the best genres for a movie?¶

Research Question 2: What other factors affect a movie's revenue?¶

Research Question 3: Who should you hire to produce a movie? ¶

Research Question 4: When should one release a movie?¶

Research Question 5: Does popularity correlate with revenue?¶

Recommendations or Further Works ¶

References ¶

	tconst	titleType	primaryTitle	originalTitle	startYear	endYear	runtimeMinutes	genres
0	tt0000001	short	Carmencita	Carmencita	1894.0	NaN	1.0	Documentary,Short
1	tt0000002	short	Le clown et ses chiens	Le clown et ses chiens	1892.0	NaN	5.0	Animation,Short
2	tt0000003	short	Pauvre Pierrot	Pauvre Pierrot	1892.0	NaN	4.0	Animation,Comedy,Romance
3	tt0000004	short	Un bon bock	Un bon bock	1892.0	NaN	12.0	Animation,Short
4	tt0000005	short	Blacksmith Scene	Blacksmith Scene	1893.0	NaN	1.0	Comedy,Short

	imdb_id	status	budget	revenue	release_date	original_language	popularity	production_companies	production_countries	spoken_languages	keywords
0	tt0002130	Released	0.0	0.0	1911-07-01	it	6.472	[[['Milano Films'], ['']]]	['IT']	[]	['hell', 'afterlife', 'purgatory', 'devil', 'd...
1	tt0002844	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	tt0003014	Released	0.0	0.0	1913-10-27	sv	3.308	[[['Svenska Biografteatern'], ['SE']]]	['SE']	[]	['foster parents', 'widow', 'run away', 'reuni...
3	tt0003037	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	tt0003165	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...
35930	tt3706352	Released	0.0	0.0	2022-04-08	en	162.576	[[['Nick Wechsler Productions'], ['US']], [['C...	['US']	['en']	['spy', 'cia', 'based on novel or book', 'cons...
35931	tt6710474	Released	25000000.0	97000000.0	2022-03-24	en	190.306	[[['A24'], ['US']], [['AGBO'], ['US']], [['Ley...	['US']	['cn', 'zh', 'en']	['mother', 'philosophy', 'martial arts', 'kung...
35932	tt8323668	Released	0.0	0.0	2022-06-15	en	79.847	[[['Netter Productions'], ['US']], [['Levantin...	['US']	['en']	['michigan', 'small town', 'massachusetts', 'h...
35933	tt15560626	Released	0.0	0.0	2022-01-21	ml	3.905	[[['Shane Nigam Films'], ['IN']], [['Plan T Fi...	['IN']	['ml']	[]
35934	tt11680472	Released	0.0	0.0	2022-06-17	pa	0.600	[[['Humble Motion Pictures'], ['']]]	['IN']	['pa']	[]

	tconst	primaryTitle	originalTitle	isAdult	startYear	runtimeMinutes	genres
0	tt0000009	Miss Jerry	Miss Jerry	0.0	1894.0	45.0	Romance
1	tt0000502	Bohemios	Bohemios	0.0	1905.0	100.0	NaN
2	tt0000574	The Story of the Kelly Gang	The Story of the Kelly Gang	0.0	1906.0	70.0	Action,Adventure,Biography
3	tt0000591	The Prodigal Son	L'enfant prodigue	0.0	1907.0	90.0	Drama
4	tt0000615	Robbery Under Arms	Robbery Under Arms	0.0	1907.0	NaN	Drama
...	...	...	...	...	...	...	...
617614	tt9916622	Rodolpho Teóphilo - O Legado de um Pioneiro	Rodolpho Teóphilo - O Legado de um Pioneiro	0.0	2015.0	57	Documentary
617615	tt9916680	De la ilusión al desconcierto: cine colombiano...	De la ilusión al desconcierto: cine colombiano...	0.0	2007.0	100	Documentary
617616	tt9916706	Dankyavar Danka	Dankyavar Danka	0.0	2013.0	NaN	Comedy
617617	tt9916730	6 Gunn	6 Gunn	0.0	2017.0	116	NaN
617618	tt9916754	Chico Albuquerque - Revelações	Chico Albuquerque - Revelações	0.0	2013.0	49	Documentary

	isAdult	startYear	runtimeMinutes	averageRating	numVotes
count	246764.000000	246764.000000	246764.000000	246764.000000	2.467640e+05
mean	0.015436	1993.997208	94.364210	6.142329	3.994537e+03
std	0.123279	25.675571	143.554644	1.322219	3.643640e+04
min	0.000000	1894.000000	1.000000	1.000000	5.000000e+00
25%	0.000000	1977.000000	81.000000	5.400000	2.200000e+01
50%	0.000000	2004.000000	91.000000	6.300000	7.700000e+01
75%	0.000000	2014.000000	103.000000	7.000000	3.870000e+02
max	1.000000	2022.000000	51420.000000	10.000000	2.620772e+06

	tconst	primaryTitle	originalTitle	isAdult	startYear	runtimeMinutes	genres	averageRating	numVotes	directors	writers
0	tt0002130	Dante's Inferno	L'Inferno	0.0	1911.0	71.0	Adventure,Drama,Fantasy	7.0	2888.0	nm0078205,nm0209738,nm0655824	nm0019604
1	tt0002844	Fantômas: In the Shadow of the Guillotine	Fantômas - À l'ombre de la guillotine	0.0	1913.0	54.0	Crime,Drama	6.9	2271.0	nm0275421	nm0019855,nm0275421,nm0816232
2	tt0003014	Ingeborg Holm	Ingeborg Holm	0.0	1913.0	96.0	Drama	7.0	1226.0	nm0803705	nm0472236,nm0803705
3	tt0003037	Fantomas: The Man in Black	Juve contre Fantômas	0.0	1913.0	61.0	Crime,Drama	6.9	1568.0	nm0275421	nm0019855,nm0275421,nm0816232
4	tt0003165	Fantômas: The Dead Man Who Killed	Le mort qui tue	0.0	1913.0	90.0	Crime,Drama,Mystery	6.9	1233.0	nm0275421	nm0019855,nm0275421,nm0816232
...	...	...	...	...	...	...	...	...	...	...	...
35930	tt11497716	The Legend	The Legend	0.0	2022.0	161.0	Action,Drama,Romance	2.6	1311.0	nm1504071,nm11226131	nm1474787
35931	tt21026586	Karan Arjun	Karan Arjun	0.0	2022.0	106.0	Action,Thriller	7.9	1860.0	nm13757693	nm13746589
35932	tt8130518	Dark Cloud	Dark Cloud	0.0	2022.0	81.0	Horror,Sci-Fi,Thriller	4.6	4211.0	nm5133487	nm7389445
35933	tt10160886	Sneakerella	Sneakerella	0.0	2022.0	112.0	Comedy,Family,Musical	4.5	1331.0	nm0020491	nm4686553,nm0330719,nm2922750,nm1760850,nm2309894
35934	tt11680472	Posti	Posti	0.0	2022.0	126.0	Comedy,Drama	9.2	1174.0	nm2378766	nm2378766

	imdb_id	primaryTitle	originalTitle	startYear	runtimeMinutes	genres	averageRating	numVotes	directors	...	status	budget	revenue	release_date	original_language	popularity	production_companies	production_countries	spoken_languages	keywords
0	tt0005078	The Cheat	The Cheat	1915.0	59.0	[Drama, Romance]	6.5	2572.0	[nm0001124]	...	Released	17311.0	137365.0	1915-12-13	en	2.375	[[[Paramount], [US]], [[Jesse L. Lasky Feature...	[US]	NaN	NaN
1	tt0004972	The Birth of a Nation	The Birth of a Nation	1915.0	195.0	[Drama, History, War]	6.2	24669.0	[nm0000428]	...	Released	100000.0	11000000.0	1915-02-08	en	11.033	[[[Epoch Film Co.], []]]	[US]	NaN	[ku klux klan, southern usa, army, racist, sil...
2	tt0006333	20,000 Leagues Under the Sea	20,000 Leagues Under the Sea	1916.0	85.0	[Action, Adventure, Sci-Fi]	6.1	1802.0	[nm0665737]	...	Released	200000.0	8000000.0	1916-12-24	en	2.897	[[[The Universal Film Mfg. Co.], []]]	[BS, US]	[en]	[revenge, captain nemo]
3	tt0009369	Mickey	Mickey	1918.0	93.0	[Comedy, Drama]	6.1	1124.0	[nm0428059, nm0949648]	...	Released	250000.0	8000000.0	1918-08-01	en	1.098	[[[Mabel Normand Feature Film Company], []]]	[US]	[en]	[new york city, horse race, fish out of water,...
4	tt0012349	The Kid	The Kid	1921.0	68.0	[Comedy, Drama, Family]	8.3	125518.0	[nm0000122]	...	Released	250000.0	2500000.0	1921-01-21	en	11.362	[[[Charles Chaplin Productions], []]]	[US]	NaN	[angel, suicide attempt, fistfight, slapstick ...

	isAdult	startYear	runtimeMinutes	averageRating	numVotes	budget	revenue	popularity
count	3216.000000	3216.000000	3216.000000	3216.000000	3.216000e+03	3.216000e+03	3.216000e+03	3216.000000
mean	0.000933	1998.135572	112.077425	6.749627	1.614920e+05	2.918425e+07	1.492110e+08	41.580716
std	0.030533	19.711077	23.025145	0.999630	2.384920e+05	4.544593e+07	2.350687e+08	124.345361
min	0.000000	1915.000000	59.000000	1.300000	1.000000e+03	1.000000e+00	1.400000e+01	0.600000
25%	0.000000	1988.000000	96.000000	6.200000	2.066275e+04	3.000000e+06	1.902573e+07	10.142000
50%	0.000000	2004.000000	108.000000	6.800000	7.690500e+04	1.100000e+07	6.516250e+07	18.334500
75%	0.000000	2013.000000	124.000000	7.500000	2.002195e+05	3.325000e+07	1.737172e+08	39.056750
max	1.000000	2022.000000	321.000000	9.400000	2.592594e+06	3.800000e+08	2.847246e+09	4156.973000

	isAdult	startYear	runtimeMinutes	averageRating	numVotes	budget	revenue	popularity
count	2948.0	2948.000000	2948.000000	2948.000000	2.948000e+03	2.948000e+03	2.948000e+03	2948.000000
mean	0.0	1999.452849	113.193012	6.753290	1.731187e+05	3.181980e+07	1.618810e+08	44.515838
std	0.0	18.062896	22.633454	0.980159	2.446566e+05	4.658079e+07	2.414239e+08	129.421131
min	0.0	1925.000000	68.000000	1.300000	1.000000e+03	5.000000e+05	1.277800e+06	0.722000
25%	0.0	1990.000000	97.000000	6.200000	2.712625e+04	4.000000e+06	2.624780e+07	11.215500
50%	0.0	2005.000000	108.500000	6.800000	8.805100e+04	1.300000e+07	7.627828e+07	19.938000
75%	0.0	2013.000000	125.000000	7.500000	2.157662e+05	3.762500e+07	1.875960e+08	42.200750
max	0.0	2022.000000	321.000000	9.200000	2.592594e+06	3.800000e+08	2.847246e+09	4156.973000

	imdb_id	primaryTitle	originalTitle	isAdult	startYear	runtimeMinutes	genres	averageRating	numVotes	directors	...	status	budget	revenue	release_date	original_language	popularity	production_companies	production_countries	spoken_languages	keywords
0	tt0015864	The Gold Rush	The Gold Rush	0.0	1925.0	95.0	[Adventure, Comedy, Drama]	8.2	110578.0	[nm0000122]	...	Released	923000.0	4000000.0	1925-07-12	en	13.315	[[[United Artists], [US]], [[Charles Chaplin P...	[US]	NaN	[river, gold, dance, worker, cabin, gold rush,...
1	tt0019344	Sadie Thompson	Sadie Thompson	0.0	1928.0	97.0	[Drama]	7.2	2708.0	[nm0909825]	...	Released	1000000.0	7000000.0	1928-01-07	en	0.954	[[[Gloria Swanson Pictures], []]]	[US]	NaN	[suicide, south seas, missionary, religion, si...
2	tt0019777	The Cocoanuts	The Cocoanuts	0.0	1929.0	96.0	[Adventure, Comedy, Musical]	6.8	7717.0	[nm0282984, nm0763798]	...	Released	500000.0	1800000.0	1929-05-23	en	4.499	[[[Paramount], [US]]]	[US]	[en]	[musical, jewel thief, pre-code, real estate]
3	tt0023196	Mata Hari	Mata Hari	0.0	1931.0	89.0	[Crime, Drama, Romance]	6.6	3724.0	[nm0280463]	...	Released	558000.0	2227000.0	1931-12-26	en	5.058	[[[Metro-Goldwyn-Mayer], [US]]]	[US]	[en]	[spy, world war i, espionage, female spy, pre-...
4	tt0021749	City Lights	City Lights	0.0	1931.0	87.0	[Comedy, Drama, Romance]	8.5	183052.0	[nm0000122]	...	Released	1500000.0	4250000.0	1931-02-01	en	14.868	[[[United Artists], [US]], [[Charles Chaplin P...	[US]	NaN	[suicide attempt, operation, blindness and imp...
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2943	tt1877830	The Batman	The Batman	0.0	2022.0	176.0	[Action, Crime, Drama]	7.9	561381.0	[nm0716257]	...	Released	185000000.0	770836163.0	2022-03-01	en	860.769	[[[Warner Bros. Pictures], [US]], [[6th & Idah...	[US]	[en]	[crime fighter, secret identity, nightclub, po...
2944	tt10698680	K.G.F: Chapter 2	K.G.F: Chapter 2	0.0	2022.0	168.0	[Action, Crime, Drama]	8.4	115241.0	[nm6073824]	...	Released	13000000.0	90410749.0	2022-04-14	kn	13.101	[[[Excel Entertainment], [IN]], [[Vaaraahi Cha...	[IN]	[kn]	[undercover, sequel, one man army, don, action...
2945	tt9419884	Doctor Strange in the Multiverse of Madness	Doctor Strange in the Multiverse of Madness	0.0	2022.0	126.0	[Action, Adventure, Fantasy]	7.0	340344.0	[nm0000600]	...	Released	200000001.0	955200000.0	2022-05-04	en	1310.241	[[[Marvel Studios], [US]], [[Kevin Feige Produ...	[US]	[cn, en, es]	[magic, superhero, based on comic, alternate r...
2946	tt8115900	The Bad Guys	The Bad Guys	0.0	2022.0	100.0	[Adventure, Animation, Comedy]	6.8	29665.0	[nm2010048]	...	Released	80000000.0	245000000.0	2022-03-17	en	459.358	[[[Universal Pictures], [US]], [[DreamWorks An...	[US]	[en, da]	[based on graphic novel, duringcreditsstinger]
2947	tt10648342	Thor: Love and Thunder	Thor: Love and Thunder	0.0	2022.0	118.0	[Action, Adventure, Comedy]	6.7	150494.0	[nm0169806]	...	Released	250000000.0	758004063.0	2022-07-06	en	4156.973	[[[Marvel Studios], [US]], [[Kevin Feige Produ...	[US]	[en]	[ex-girlfriend, hero, greek mythology, sequel,...

	revenue	genres
0	4000000.0	Adventure
1	4000000.0	Comedy
2	4000000.0	Drama
3	7000000.0	Drama
4	1800000.0	Adventure

	revenue	spoken_languages	country
0	8000000.0	en	English
1	8000000.0	en	English
2	274827.0	en	English
3	1000000.0	en	English
4	2600000.0	en	English
...	...	...	...
16034	3900000.0	en	English
16035	3900000.0	ta	Tamil
16036	1564696.0	en	English
16037	15400000.0	en	English
16038	15400000.0	ro	Romanian

	keywords
based on novel or book	1965
woman director	1885
murder	1632
revenge	871
based on true story	719
new york city	697
biography	691
love	688
musical	586
sequel	580
police	556
friendship	547
parent child relationship	501

	keywords
based on novel or book	1.09
woman director	1.04
murder	0.90
revenge	0.48
based on true story	0.40
new york city	0.39
biography	0.38
love	0.38
musical	0.32
sequel	0.32
police	0.31
friendship	0.30
parent child relationship	0.28

	keywords
based on novel or book	323
duringcreditsstinger	212
sequel	171
murder	164
new york city	145
aftercreditsstinger	132
woman director	122
revenge	120
based on true story	109
friendship	107
parent child relationship	101

	keywords
based on novel or book	1.05
duringcreditsstinger	0.69
sequel	0.55
murder	0.53
new york city	0.47
aftercreditsstinger	0.43
woman director	0.40
revenge	0.39
based on true story	0.35
friendship	0.35
parent child relationship	0.33

	keywords	revenue
0	superhero	6.404721e+08
1	based on comic	6.013988e+08
2	magic	4.993211e+08
3	aftercreditsstinger	4.244748e+08
4	based on young adult novel	4.230694e+08
5	sequel	3.668738e+08
6	duringcreditsstinger	3.455082e+08
7	alien	3.204866e+08
8	spy	2.801209e+08
9	dystopia	2.447500e+08

		frequency
genres	keywords
Action	martial arts	442
	revenge	381
	based on novel or book	287
	murder	284
	sequel	261

What makes a movie successful? by [Kai Wen]¶

Table of Content (with relevant hyperlinks to sections)¶

Motivation and Background¶

Summary of Research Questions & Results¶

Dataset¶

Methodology¶

Data Acquisition ¶

Data Cleaning¶

EDA¶

Research Question 1: What are the best genres for a movie?¶

Research Question 2: What other factors affect a movie's revenue?¶

Research Question 3: Who should you hire to produce a movie?¶

Research Question 4: When should one release a movie?¶

Research Question 5: Does popularity correlate with revenue?¶

Results Findings & Conclusion¶

Research Question 1: What are the best genres for a movie?¶

Research Question 2: What other factors affect a movie's revenue?¶

Research Question 3: Who should you hire to produce a movie? ¶

Research Question 4: When should one release a movie?¶

Research Question 5: Does popularity correlate with revenue?¶

Recommendations or Further Works¶

References¶

Motivation and Background ¶

Summary of Research Questions & Results ¶

Dataset ¶

Methodology ¶

Data Cleaning ¶

EDA ¶

Results Findings & Conclusion ¶

Recommendations or Further Works ¶

References ¶