# Learn practical skills, build real-world projects, and advance your career
import pandas as pd
import numpy as np
import vaex as vx
import datetime
# Load the ratings table from the local 'ml-25m' directory with vaex.
# The path is relative, so it resolves against the current working directory.
df_ratings = vx.from_csv('ml-25m/ratings.csv')
# Load the movies table from the same directory; it is joined onto the
# ratings by 'movieId' further down.
# NOTE(review): this call uses read_csv while the one above uses from_csv;
# vaex documents read_csv as an alias of from_csv — confirm and pick one spelling.
df_movies = vx.read_csv('ml-25m/movies.csv')
# string manipulation functions for data cleaning
def split_year(x: str) -> int:
    """Extract the first 4-digit year that follows an opening parenthesis.

    The title is split on '(' and each piece *after* the first '(' is cut at
    its first ')'. The first piece that is exactly four decimal digits is
    returned as an int.

    Args:
        x: Title text, e.g. ``'Toy Story (1995)'``.

    Returns:
        The year as an int, or ``-1`` when no such year is present.
    """
    # Skip element 0 of the split: it is the text *before* the first '(' and
    # can never be a parenthesised year (a bare title like '1984' is no year).
    for segment in x.split('(')[1:]:
        candidate = segment.split(')')[0]
        # isdecimal() only accepts characters int() can parse; isnumeric()
        # also accepts e.g. CJK numerals or fractions, which make int() raise.
        if len(candidate) == 4 and candidate.isdecimal():
            return int(candidate)
    return -1

def remove_year(x: str) -> str:
    """Return the title with its parenthesised 4-digit year removed.

    The year is located by splitting the title on '(' and cutting each piece
    *after* the first '(' at its first ')'; the first piece that is exactly
    four decimal digits is taken as the year. Every ``'(YYYY)'`` occurrence of
    that year is removed and surrounding whitespace is stripped.

    Args:
        x: Title text, e.g. ``'Toy Story (1995)'``.

    Returns:
        The cleaned title, or ``''`` when no year is found. Note that in the
        no-year case the title itself is discarded; callers relying on the
        empty-string sentinel are unaffected by this rewrite.
    """
    # Skip element 0 of the split: it precedes the first '(' and would
    # otherwise let a leading 4-digit title (e.g. '2001(1968)') hijack the match.
    for segment in x.split('(')[1:]:
        candidate = segment.split(')')[0]
        # isdecimal() restricts the match to real decimal digits.
        if len(candidate) == 4 and candidate.isdecimal():
            # strip() drops the space that separated the title from the year.
            return x.replace(f'({candidate})', '').strip()
    return ''
# Derive a 'date' column: each 'timestamp' value is turned into a
# 'YYYY-MM-DD' string via a per-row Python callback.
# NOTE(review): datetime.fromtimestamp() without a tz argument converts to the
# machine's local timezone, so the resulting date can differ between hosts —
# confirm whether UTC was intended.
df_ratings['date'] = df_ratings['timestamp'].apply(lambda x : datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d'))
# Attach the movie columns to every rating, matching rows on 'movieId'.
# NOTE(review): the key column has the same name on both sides and no
# lsuffix/rsuffix is passed — confirm vaex does not reject this as a column
# name collision.
df_ratings = df_ratings.join(df_movies, left_on='movieId', right_on='movieId')