Learn practical skills, build real-world projects, and advance your career
Updated 2 years ago
import pandas as pd
import numpy as np
# load the movies table and the ratings table from their CSV files;
# both paths are relative to the current working directory
# (presumably the MovieLens 25M dataset, judging by the 'ml-25m' folder name -- TODO confirm)
df_movies = pd.read_csv('ml-25m/movies.csv')
df_ratings = pd.read_csv('ml-25m/ratings.csv')
# string manipulation functions for data cleaning
def split_year(x):
    """Return the four-digit year found in parentheses inside ``x``.

    The string is cut at every ``'('``; for each piece, the text up to the
    first ``')'`` (or the whole piece when there is no ``')'``) is taken as a
    candidate. The first candidate that is exactly four decimal digits wins.

    Args:
        x: Title string, e.g. ``'Toy Story (1995)'``.

    Returns:
        The year as an ``int``, or ``-1`` when no candidate qualifies.
    """
    for piece in x.split('('):
        candidate = piece.split(')')[0]
        # isdecimal() rather than isnumeric(): isnumeric() also accepts
        # superscripts, fractions and CJK numerals, which int() cannot parse
        # and would turn an odd title into a ValueError instead of -1.
        if len(candidate) == 4 and candidate.isdecimal():
            return int(candidate)
    return -1
def remove_year(x):
    """Return ``x`` with its parenthesised four-character year removed.

    The string is cut at every ``'('``; for each segment, the text before the
    first ``')'`` is the candidate. For the first candidate that is numeric
    and four characters long, every occurrence of ``'(' + candidate + ')'``
    is deleted from ``x``. Surrounding whitespace is left as it was, so
    ``'Toy Story (1995)'`` becomes ``'Toy Story '``.

    Args:
        x: Title string.

    Returns:
        The title without the year marker, or ``''`` when no segment
        qualifies as a year.
    """
    for segment in x.split('('):
        candidate = segment.partition(')')[0]
        if len(candidate) == 4 and candidate.isnumeric():
            return x.replace('(' + candidate + ')', '')
    # NOTE(review): a title without a year yields an empty string, not the
    # original title -- presumably a sentinel for later filtering; confirm.
    return ''
# userId boundaries of the chunks; per the original author, the values were
# picked so that no single user's rows end up in two different chunks
ids = [1, 16658, 32564, 48690, 64883, 80914, 97160, 113417, 130048, 146281, 162541]
# one dataframe per boundary, collected in order
dfs = []
for i, start in enumerate(ids):
    # each chunk covers start <= userId < id_split; the upper bound is the
    # next boundary, except for the last boundary, which is paired with
    # itself + 1 so the final chunk holds only rows with userId == ids[-1]
    id_split = ids[i + 1] if i + 1 < len(ids) else start + 1
    dfs.append(
        df_ratings.loc[
            (df_ratings['userId'] >= start) & (df_ratings['userId'] < id_split)
        ]
    )