Jovian
⭐️
Sign In
Learn data science and machine learning by building real-world projects on Jovian

South Africa - Crime Data Visualization

import Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import ee
import geemap
import requests

import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas.tools import sjoin

from ipyleaflet import (Choropleth, Marker, Popup, WidgetControl)
from ipywidgets import (HTML, Text)

from matplotlib import pyplot as plt
from matplotlib import style

style.use('seaborn-deep')
In [2]:
ee.Initialize()

ORS Token

In [3]:
with open(file='ors_token.txt', mode='r') as ors:
    api_key = ors.read()

Geocoder

In [4]:
def geocode(place, api_key):
    url = 'https://api.openrouteservice.org/geocode/search?api_key={}&text={}'
    url = url.format(api_key, place)
    
    headers = {
        'Accept' : 'application/json, application/geo+json, application/gpx+xml, img/png; charset=utf-8',
    }

    req = requests.get(url=url, headers=headers)
    print('Geocode response status - {} : {} {}'.format(place, req.status_code, req.reason))

    if (req.status_code == 200):
        req_data = req.json()
        features = req_data['features']
        # format - [longitude, latitude]
        coords = features[0]['geometry']['coordinates']
        return coords

    return None
In [5]:
popn_province_df = pd.read_csv(filepath_or_buffer='Crimes_SA/ProvincePopulation.csv')
sa_crimes_df = pd.read_csv(filepath_or_buffer='Crimes_SA/SouthAfricaCrimeStats_v2.csv')
police_bounds_gdf = gpd.read_file(filename='Crimes_SA/Police_points.shp')
In [6]:
sa_crimes_df.head()
Out[6]:
In [7]:
police_bounds_gdf['COMPNT_NM'] = police_bounds_gdf['COMPNT_NM'].apply(lambda x: x.title())
police_bounds_gdf['CREATE_YEAR'] = police_bounds_gdf['CREATE_DT'].apply(lambda x: str(x)[:4])
In [8]:
print('Population Province - shape : ', popn_province_df.shape)
print('SA Crimes Data - shape : ', sa_crimes_df.shape)
print('Police Bounds Data - shape : ', police_bounds_gdf.shape)
Population Province - shape : (9, 4) SA Crimes Data - shape : (30861, 14) Police Bounds Data - shape : (1143, 7)

Additional Data

SA - Country, States, Districts

In [9]:
sa_country = gpd.read_file(filename='south_africa/south_africa.shp')
sa_states = gpd.read_file(filename='ZAF_adm1/ZAF_adm1.shp')
sa_districts = gpd.read_file(filename='ZAF_adm2/ZAF_adm2.shp')
In [10]:
sa_districts.head()
Out[10]:
In [11]:
sa_districts.drop(columns=['NL_NAME_2', 'VARNAME_2'], inplace=True)
sa_districts.dropna(axis=0, inplace=True)
In [12]:
sa_districts.head()
Out[12]:

Converting .shp files into ee objects

In [13]:
sa_country_ee = geemap.geopandas_to_ee(gdf=sa_country)
sa_states_ee = geemap.geopandas_to_ee(gdf=sa_states)
# sa_districts_ee = geemap.geopandas_to_ee(gdf=sa_districts)
In [14]:
location = geocode(place='south africa', api_key=api_key)
location = location[::-1]
Geocode response status - south africa : 200 OK

South Africa - Provinces

In [15]:
m = geemap.Map(location=location, zoom=5)

m.addLayer(ee_object=sa_country_ee, vis_params={}, name='SA Country', shown=False)
m.addLayer(ee_object=sa_states_ee, vis_params={}, name='SA States')
# m.addLayer(ee_object=sa_districts_ee, vis_params={}, name='SA Districts', shown=False)

m
Map(center=[-31.3096, 18.357], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox(chi…

Merging sa_states and popn_province_df

In [16]:
sa_states = sa_states[['NAME_1', 'geometry']]
sa_states.columns = ['Province', 'geometry']
sa_states['Province'] = sa_states['Province'].apply(lambda x: x.title().replace('-', '/'))
sa_provinces_df  = popn_province_df.merge(right=sa_states, on='Province', how='right')
sa_provinces_gdf = gpd.GeoDataFrame(sa_provinces_df)
sa_provinces_gdf = sa_provinces_gdf.set_index(keys='Province')
sa_provinces_gdf.head()
Out[16]:

EDA and Visualization

Population of each province of SA

In [17]:
popn_province_df.plot(x='Province', y='Population', kind='bar')
plt.show()
Notebook Image

Choropleth map visualization of provinces w.r.t population

In [18]:
m = geemap.Map(location=location, zoom=5)

choropleth_layer = Choropleth(
    geo_data=sa_provinces_gdf.geometry.__geo_interface__,
    choro_data=dict(sa_provinces_gdf['Population']),
    style={'fillOpacity': 1.0, "color":"black"}
)

m.add_layer(layer=choropleth_layer)

m
Map(center=[-31.3096, 18.357], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox(chi…

All Crimes

In [19]:
crimes_list = sa_crimes_df['Category'].value_counts().index.to_list()
crimes_list
Out[19]:
['Sexual Offences',
 'Sexual offences as result of police action',
 'Arson',
 'Drug-related crime',
 'Burglary at non-residential premises',
 'All theft not mentioned elsewhere',
 'Malicious damage to property',
 'Burglary at residential premises',
 'Theft out of or from motor vehicle',
 'Driving under the influence of alcohol or drugs',
 'Robbery of cash in transit',
 'Theft of motor vehicle and motorcycle',
 'Commercial crime',
 'Murder',
 'Assault with the intent to inflict grievous bodily harm',
 'Robbery with aggravating circumstances',
 'Shoplifting',
 'Illegal possession of firearms and ammunition',
 'Stock-theft',
 'Attempted murder',
 'Common assault',
 'Robbery at residential premises',
 'Bank robbery',
 'Truck hijacking',
 'Common robbery',
 'Carjacking',
 'Robbery at non-residential premises']

Crimes (single) reported per station in a selected province and year

In [20]:
def get_coords(police_df, station):
    try:
        coords = police_df[police_df['COMPNT_NM'] == station].geometry.values[0]
    except Exception as e:
        coords = None
    return coords

def fetch_crime_by_stations(province, crimes_data, police_data, category, year, threshold=20, showplot=False):
    if not province:
        return None
    
    if not year in range(2005, 2016):
        return None
    
    start_year = year
    end_year = start_year + 1
    year_col = '{}-{}'.format(start_year, end_year)
    
    p_df = crimes_data[crimes_data['Province'] == province]
    pc_df = p_df[p_df['Category'] == category]
    sc_df = pc_df[['Station', year_col]]
    
    sc_df = pc_df.groupby(by='Station').sum()
    sc_df = sc_df.reset_index()
    
    if isinstance(threshold, int):
        sc_df = sc_df[sc_df[year_col] > threshold]
    
    sc_df['geometry'] = [get_coords(police_df=police_data, station=i) for i in sc_df['Station']]
    
    if showplot:
        fig, ax = plt.subplots()
        plt.title('Crime Category : {}; Province : {}'.format(category, province))
        sc_df.plot(x='Station', y=year_col, kind='barh', figsize=(10, 10), ax=ax)
        plt.show()
        return None
    
    return gpd.GeoDataFrame(sc_df)

Murders in the year 2006

In [21]:
fetch_crime_by_stations(
    province='Western Cape',
    crimes_data=sa_crimes_df,
    police_data=police_bounds_gdf,
    category='Murder',
    year=2006,
    showplot=True
)
Notebook Image

Murders in the year 2010

In [22]:
fetch_crime_by_stations(
    province='Eastern Cape',
    crimes_data=sa_crimes_df,
    police_data=police_bounds_gdf,
    category='Murder',
    year=2010,
    showplot=True
)
Notebook Image

Plot stations by province

In [23]:
def plot_stations_by_province(province, district_bounds, crimes_data, police_data, category, year):
    start_year = year
    end_year = start_year + 1
    year_col = '{}-{}'.format(start_year, end_year)
    
    sc_gdf = fetch_crime_by_stations(
        province=province,
        crimes_data=crimes_data,
        police_data=police_data,
        category=category,
        year=year
    )
    
    province_districts = district_bounds[district_bounds['NAME_1'] == province]
    province_districts_ee = geemap.geopandas_to_ee(gdf=province_districts)
    province_location = geocode(place=province, api_key=api_key)
    province_location = province_location[::-1]
    
    m = geemap.Map(location=province_location, zoom=7)
    m.addLayer(ee_object=province_districts_ee, vis_params={}, name=province)
    
    for (each_point, each_station, each_count) in zip(sc_gdf['geometry'], sc_gdf['Station'], sc_gdf[year_col]):
        each_loc = [each_point.y, each_point.x]
        each_marker = Marker(
            location=each_loc,
            draggable=False,
            # title='{}; {}'.format(each_station, each_count)
        )
        each_message = HTML()
        each_message.value = '{}; {}'.format(each_station, each_count)
        m.add_layer(each_marker)
        each_marker.popup = each_message
    
    return m
In [24]:
plot_stations_by_province(
    province='Western Cape',
    district_bounds=sa_districts,
    crimes_data=sa_crimes_df,
    police_data=police_bounds_gdf,
    category='Murder',
    year=2006
)
Geocode response status - Western Cape : 200 OK
Map(center=[-33.582933, 19.760643], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBo…
In [25]:
plot_stations_by_province(
    province='Eastern Cape',
    district_bounds=sa_districts,
    crimes_data=sa_crimes_df,
    police_data=police_bounds_gdf,
    category='Murder',
    year=2010
)
Geocode response status - Eastern Cape : 200 OK
Map(center=[-32.19228, 26.501313], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox…
In [26]:
# plot_stations_by_province(
#     province='Western Cape',
#     district_bounds=sa_districts,
#     crimes_data=sa_crimes_df,
#     police_data=police_bounds_gdf,
#     category='Murder',
#     year=2012
# )

Plot choropleth map with the count of stations (where crimes have occurred) per district

  • Spatial join operation to check if a station (point) exists in district (polygon).
In [27]:
def plot_choropleth_by_year(province, district_bounds, crimes_data, police_data, category, year):
    start_year = year
    end_year = start_year + 1
    year_col = '{}-{}'.format(start_year, end_year)
    
    sc_gdf = fetch_crime_by_stations(
        province=province,
        crimes_data=crimes_data,
        police_data=police_data,
        category=category,
        year=year
    )
    
    province_districts = district_bounds[district_bounds['NAME_1'] == province]
    province_districts_ee = geemap.geopandas_to_ee(gdf=province_districts)
    province_location = geocode(place=province, api_key=api_key)
    province_location = province_location[::-1]
    
    sinprovince = sjoin(left_df=sc_gdf, right_df=province_districts, how="right")
    sinprovince = sinprovince.dropna(axis=0)
    sinprovince = sinprovince[['Station', 'NAME_2', year_col]]
    sinprovince = sinprovince.groupby(by='NAME_2').sum()
    sinprovince['geometry'] = [
        province_districts[province_districts['NAME_2'] == i]['geometry'].values[0] for i in sinprovince.index
    ]
    sinprovince = gpd.GeoDataFrame(sinprovince)
    
    m = geemap.Map(location=province_location, zoom=7)
    m.addLayer(ee_object=province_districts_ee, vis_params={}, name='{} Districts'.format(province))

    choropleth_layer = Choropleth(
        geo_data=sinprovince.geometry.__geo_interface__,
        choro_data=dict(sinprovince[year_col]),
        style={'fillOpacity': 1.0, "color":"black"}
    )

    m.add_layer(layer=choropleth_layer)
    
    html = HTML('''
        <h4>Province : <b>{}</b></h4>
        Hover over a district
    '''.format(province))
    html.layout.margin = '0px 20px 20px 20px'
    control = WidgetControl(widget=html, position='bottomright')
    
    m.add_control(control)
    def update_html(feature, **kwargs):
        # print(feature)
        html.value = '''
            <h4>Province : <b>{}</b></h4>
            <p>District : <b>{}</b></p>
            <p>Year : <b>{}</b></p>
            <p>Crime category : <b>{}</b></p>
            <p>Cases reported : <b>{}</b></p>
        '''.format(
                province, 
                feature['id'], 
                year_col, 
                category, 
                sinprovince.loc[feature['id']][year_col]
            )

    choropleth_layer.on_hover(callback=update_html)
    
    return m
In [28]:
plot_choropleth_by_year(
    province='Eastern Cape',
    district_bounds=sa_districts,
    crimes_data=sa_crimes_df,
    police_data=police_bounds_gdf,
    category='Murder',
    year=2010
)
Geocode response status - Eastern Cape : 200 OK
Map(center=[-32.19228, 26.501313], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox…
In [29]:
plot_choropleth_by_year(
    province='Gauteng',
    district_bounds=sa_districts,
    crimes_data=sa_crimes_df,
    police_data=police_bounds_gdf,
    category='Murder',
    year=2008
)
Geocode response status - Gauteng : 200 OK
Map(center=[-26.238909, 28.059764], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBo…

Plot choropleth map with the sum of all reported crimes in all the years from each district

  • Spatial join operation to check if a station (point) exists in district (polygon).
In [30]:
def fetch_total_crimes_by_category(province, category, crimes_data, police_data):
    province_data = crimes_data[crimes_data['Province'] == province]
    pc_data = province_data[province_data['Category'] == category]
    
    num_cols = list(pc_data._get_numeric_data().columns)
    total_crimes = pc_data[num_cols].sum(axis=1).to_list()
    
    pc_df = pc_data.drop(columns=num_cols, axis=1)
    pc_df['total_crimes'] = total_crimes
    
    sc_df = pc_df.groupby(by='Station').sum()
    sc_df['geometry'] = [get_coords(police_df=police_data, station=i) for i in sc_df.index]
    sc_df = sc_df.reset_index()
    
    return gpd.GeoDataFrame(sc_df)
In [31]:
def plot_choropleth_of_total_crimes(province, category, district_bounds, crimes_data, police_data):
    sc_df = fetch_total_crimes_by_category(
        province=province,
        category=category,
        crimes_data=crimes_data,
        police_data=police_data
    )
    
    province_districts = district_bounds[district_bounds['NAME_1'] == province]
    province_districts_ee = geemap.geopandas_to_ee(gdf=province_districts)
    province_location = geocode(place=province, api_key=api_key)
    province_location = province_location[::-1]
    
    sinprovince = sjoin(left_df=sc_df, right_df=province_districts, how="right")
    sinprovince = sinprovince.dropna(axis=0)
    sinprovince = sinprovince.reset_index(drop=True)
    sinprovince = sinprovince[['Station', 'NAME_2', 'total_crimes']]
    sinprovince = sinprovince.groupby(by='NAME_2').sum()
    sinprovince['geometry'] = [
        province_districts[province_districts['NAME_2'] == i]['geometry'].values[0] for i in sinprovince.index
    ]
    sinprovince = gpd.GeoDataFrame(sinprovince)
    
    m = geemap.Map(location=province_location, zoom=7)
    m.addLayer(ee_object=province_districts_ee, vis_params={}, name='{} Districts'.format(province))

    choropleth_layer = Choropleth(
        geo_data=sinprovince.geometry.__geo_interface__,
        choro_data=dict(sinprovince['total_crimes']),
        style={'fillOpacity': 1.0, "color":"black"}
    )

    m.add_layer(layer=choropleth_layer)
    
    html = HTML('''
        <h4>Province : <b>{}</b></h4>
        Hover over a district
    '''.format(province))
    html.layout.margin = '0px 20px 20px 20px'
    control = WidgetControl(widget=html, position='bottomright')
    
    m.add_control(control)
    def update_html(feature, **kwargs):
        # print(feature)
        html.value = '''
            <h4>Province : <b>{}</b></h4>
            <p>District : <b>{}</b></p>
            <p>Crime category : <b>{}</b></p>
            <p>Cases reported : <b>{}</b></p>
        '''.format(
                province, 
                feature['id'],  
                category, 
                sinprovince.loc[feature['id']]['total_crimes']
            )

    choropleth_layer.on_hover(callback=update_html)
    
    return m
In [32]:
plot_choropleth_of_total_crimes(
    province='Western Cape', 
    category='Murder', 
    crimes_data=sa_crimes_df,
    district_bounds=sa_districts,
    police_data=police_bounds_gdf
)
Geocode response status - Western Cape : 200 OK
Map(center=[-33.582933, 19.760643], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBo…

End