Movie Ratings Data Analysis - Notebook by Aman Jain (aman-jain)

Learn practical skills, build real-world projects, and advance your career

Updated 2 years ago

import pandas as pd
import numpy as np

# load the movies table and the ratings table from the ml-25m directory
# (movies first, then ratings)
df_movies, df_ratings = map(
    pd.read_csv,
    ('ml-25m/movies.csv', 'ml-25m/ratings.csv'),
)

# string manipulation functions for data cleaning
def split_year(x):
    """Return the four-digit year written in parentheses inside ``x``.

    The string is scanned left to right and the first ``(dddd)`` group wins.

    Args:
        x: Title string, e.g. ``'Toy Story (1995)'``.

    Returns:
        The year as an ``int``, or ``-1`` when ``x`` contains no
        parenthesised four-digit number.
    """
    # Skip the text before the first '(': it is not inside parentheses, so a
    # bare title such as '1984' must not be mistaken for a year.
    for piece in x.split('(')[1:]:
        candidate, closing, _ = piece.partition(')')
        # Require the closing ')' as well, and use isdecimal() rather than
        # isnumeric(): isnumeric() also accepts characters such as fractions
        # that int() cannot parse, which would raise ValueError.
        if closing and len(candidate) == 4 and candidate.isdecimal():
            return int(candidate)
    return -1

def remove_year(x):
    """Return ``x`` with its ``(year)`` marker removed.

    ``x`` is cut at every '(' and, for each piece, the text before the first
    ')' is taken as a candidate. The first candidate that is exactly four
    numeric characters is wrapped in parentheses and every occurrence of that
    text is deleted from ``x``. Surrounding whitespace is left untouched, so
    ``'Toy Story (1995)'`` becomes ``'Toy Story '``.

    Returns an empty string when no candidate qualifies.
    """
    candidates = (chunk.split(')')[0] for chunk in x.split('('))
    year = next(
        (text for text in candidates if len(text) == 4 and text.isnumeric()),
        None,
    )
    if year is None:
        # no four-character numeric candidate anywhere in the string
        return ''
    return x.replace(f'({year})', '')

# splits for each chunk to be created (user ids were chosen such that, at each split, no particular user gets
# split into different chunks)
ids = [1, 16658, 32564, 48690, 64883, 80914, 97160, 113417, 130048, 146281, 162541]

# upper (exclusive) bound for every start id: the next id in the list, and for
# the final id simply that id + 1, so the last chunk holds only that one user id
upper_bounds = ids[1:] + [ids[-1] + 1]

# split the ratings into a python list of dataframes, one per
# [start, upper bound) range of userId
user_ids = df_ratings['userId']
dfs = [
    df_ratings.loc[(user_ids >= lower) & (user_ids < upper)]
    for lower, upper in zip(ids, upper_bounds)
]