Learn practical skills, build real-world projects, and advance your career
Updated 2 years ago
import datetime
import re

import numpy as np
import pandas as pd
import vaex as vx
# Load the ratings table from CSV.
df_ratings = vx.from_csv('ml-25m/ratings.csv')
# Load the movies table with the same reader as above; `read_csv` is only an
# alias of `from_csv`, so one spelling is used for both tables.
df_movies = vx.from_csv('ml-25m/movies.csv')
# string manipulation functions for data cleaning
def split_year(x):
    """Extract the release year from a movie title.

    The year is the first group of exactly four digits enclosed in
    parentheses, e.g. ``'Toy Story (1995)'`` gives ``1995``.

    Args:
        x: Movie title.

    Returns:
        The year as an ``int``, or ``-1`` when ``x`` is not a string or
        holds no parenthesised four-digit year.
    """
    if not isinstance(x, str):
        # A missing title (None/NaN) has no year; reuse the "not found" sentinel
        # instead of failing on a string method call.
        return -1
    # Both parentheses are required so a bare four-digit title such as '1984'
    # is not mistaken for a year. The digit class only matches decimal digits,
    # which int() can always parse; str.isnumeric() also accepts characters
    # like superscript two that make int() raise ValueError.
    match = re.search(r'\((\d{4})\)', x)
    return int(match.group(1)) if match else -1
def remove_year(x):
    """Remove the parenthesised release year from a movie title.

    The year is the first group of exactly four digits enclosed in
    parentheses. Every occurrence of that exact ``(YYYY)`` text is removed;
    surrounding whitespace is left as is, so ``'Toy Story (1995)'`` becomes
    ``'Toy Story '``.

    Args:
        x: Movie title.

    Returns:
        The title without the year text, or ``''`` when ``x`` is not a
        string or holds no parenthesised four-digit year.
    """
    if not isinstance(x, str):
        # A missing title (None/NaN) is treated like a title without a year.
        return ''
    # Require both parentheses so the text being removed is always a real
    # '(YYYY)' group and never a bare four-digit title.
    match = re.search(r'\(\d{4}\)', x)
    if match is None:
        return ''
    # str.replace drops every repeat of the matched year text, not just the
    # first one.
    return x.replace(match.group(0), '')
# Convert the POSIX timestamp of each rating to a 'YYYY-MM-DD' date string.
# The timezone is pinned to UTC: without `tz`, fromtimestamp() uses the local
# timezone of the machine, so the same rating could land on different dates
# depending on where the script runs.
df_ratings['date'] = df_ratings['timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(
        x, tz=datetime.timezone.utc
    ).strftime('%Y-%m-%d')
)
# Attach the movie columns to every rating row, matching on 'movieId'.
df_ratings = df_ratings.join(df_movies, left_on='movieId', right_on='movieId')