# Learn practical skills, build real-world projects, and advance your career
from NotebookToggle import hide_toggle

hide_toggle(for_next=True)
# Script Content:
# Second Data Analytics Project
# Descriptive Analysis of OKCupid Profiles Data
# 1. Meta Analysis: Missing Values etc.
# 2. Basic Descriptive Stats
# 3. Essay Length/Ridgeline plots
# 4. Regional Analysis

# Authors: Andrew Boomer, Jacob Pichelmann

# Date: 19.03.2021

#### 0. Setup ####

## import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joypy
import os
import re
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from pandas.api.types import CategoricalDtype
from bs4 import BeautifulSoup
from prettytable import PrettyTable

## set up paths

# Force the correct directory: when the script is started from inside the
# CODE folder, step up one level so the relative folders below resolve.
# basename() is used instead of splitting on "/" so the check also works
# where the path separator is not a forward slash.
if os.path.basename(os.getcwd()) == "CODE":
    os.chdir("..")
curr_dir = os.getcwd()

# Create the output directory if it does not already exist. exist_ok=True
# replaces the separate isdir() check, so there is no gap between checking
# and creating.
os.makedirs("OUTPUT", exist_ok=True)
out_dir = curr_dir + "/OUTPUT/"


# Input files are read from the DATA folder next to OUTPUT.
inp_dir = curr_dir + "/DATA/"

## import data

# Strings that should be read as missing; "-1" is how a missing income is
# recorded in the file.
missing_values = ["nan", "-1"]
profiles_csv = inp_dir + '3-profiles.csv'
df = pd.read_csv(profiles_csv, na_values=missing_values)

hide_toggle(for_next=True)
#### 1. Meta Analysis ####

# Draw the boolean missing-value mask as a heatmap (one cell per row/column
# entry of df), without a colour bar.
missing_mask = df.isna()
sns.heatmap(missing_mask, cbar=False)

hide_toggle(for_next=True)
# [Notebook Image]
#### 2. Basic Descriptive Stats ####

# 2.1 Numerical variables
# 2.1.1 summary statistics
num_cats = ['age', 'income', 'height']

# Overall summary of the whole frame first ...
display(df.describe())

# ... then the same summary for each numerical column, split by sex.
by_sex = df.groupby(['sex'])
for num_cat in num_cats:
    display(by_sex[[num_cat]].describe())

# 2.1.2 plots
def plot_line(data, num_cat):
    """Plot one line per sex showing how many rows share each ``num_cat`` value.

    data: frame-like object with a 'sex' column and the column named by
        ``num_cat``.
    num_cat: name of the column whose values are counted.
    """
    # Group the frame that was passed in. The earlier version grouped the
    # module-level ``df`` and silently ignored ``data``.
    data.groupby([num_cat, 'sex']).size().unstack().plot()

# 2.2 Categorical variables
# 2.2.1
# what does most common man/women in data set look like?
# create helper function that returns most frequent characteristic

def max_freq(col, data=None):
    """Return the most frequent non-missing value of ``data[col]``.

    col: column name to inspect.
    data: frame to read from; when omitted, the module-level ``df`` is used.

    Returns the most common value, or None when the column has no
    non-missing values. On ties the first entry in value_counts() order wins.
    """
    if data is None:
        # Look up the default at call time rather than freezing whatever
        # ``df`` was bound to when the function was defined.
        data = df
    # value_counts() drops missing values and sorts by count, highest first,
    # so the first index label is the most frequent value. Reading the label
    # directly avoids depending on the column names reset_index() produces,
    # which differ between pandas versions.
    counts = data[col].value_counts()
    if counts.empty:
        return None
    return counts.index[0]


# Split the profiles by sex.
males = df.loc[df['sex'] == 'm']
females = df.loc[df['sex'] == 'f']

# Characteristics for which the most frequent value is looked up.
cols = ['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'ethnicity',
        'height', 'income', 'job']

# Most frequent value of each characteristic, in the order given by cols.
res_male = [max_freq(col, males) for col in cols]
res_female = [max_freq(col, females) for col in cols]

# put results in a table (one row per entry of cols, one column per sex)
hybrid = pd.DataFrame({'F': res_female, 'M': res_male})

display(hybrid) # we can print this in a sentence?

# Interactive line plot with a dropdown over the numerical columns.
interact(plot_line, data = fixed(df), num_cat = num_cats)

hide_toggle(for_next=True)
# interactive(children=(Dropdown(description='num_cat', options=('age', 'income', 'height'), value='age'), Outpu…
# 2.2.2 Categorical plots

# data prep
def _split_first_word(series):
    """Split each string at its first space into (first word, remainder).

    Missing entries stay missing in both results, so later notna() filters
    still see them as missing instead of as the text "nan". Entries without
    a space get an empty-string remainder.
    """
    parts = series.str.split(" ", n=1)
    first = parts.str[0]
    # .str[1] is missing when there is no remainder; turn that into "" but
    # keep entries that were missing to begin with as missing.
    rest = parts.str[1].fillna("").where(series.notna())
    return first, rest


# First listed ethnicity only, without the trailing comma.
df['ethnicity_substr'] = df['ethnicity'].str.split().str[0].str.strip(',')

# religion -> first word; religion_level -> the rest of the string.
df['religion'], df['religion_level'] = _split_first_word(df['religion'])

# Same split for sign, after normalising the typographic apostrophe.
# NOTE(review): if the raw file encodes the apostrophe differently (e.g. as
# an HTML entity) this replace will not match — confirm against the CSV.
sign_clean = df['sign'].str.replace("’", "'", regex=False)
df['sign'], df['sign_level'] = _split_first_word(sign_clean)

# Every column except the essay columns; selected on the column index
# directly so the frame itself is not copied.
categories = df.columns[[not c.startswith('essay') for c in df.columns]]

def plot_bar(data, category):
    """Show the distribution of ``category``, with dodged bars per sex.

    data: frame with a 'sex' column and the column named by ``category``.
    category: column to plot; rows where it is missing are left out.
    """
    # displot is a figure-level function and creates its own figure, so no
    # plt.figure() call is made first (that only produced an empty figure).
    sns.displot(data[data[category].notna()], x=category, hue="sex", multiple="dodge")
    # plt.savefig(out_dir + "Bar_" + category + ".png", bbox_inches='tight')
    # Call show(); the bare attribute access ``plt.show`` did nothing.
    plt.show()
    
# Interactive bar plot: the frame is passed through fixed(), the category is
# picked from the non-essay columns.
interact(
    plot_bar,
    data=fixed(df),
    category=categories,
)

hide_toggle(for_next=True)
# interactive(children=(Dropdown(description='category', options=('age', 'body_type', 'diet', 'drinks', 'drugs',…