Learn data science and machine learning by building real-world projects on Jovian

Exploring Foursquare API

In [1]:
import requests
import json
import os
import codecs

from bs4 import BeautifulSoup
In [2]:
class FourSquare:
    """Scraper for the venue-category listing in the Foursquare developer docs.

    Attributes (set in ``__init__``):
        url: Address of the page that is fetched.
        category_class: ``class`` attribute value of the ``<ul>`` element that
            is searched for headings and ids.
    """

    def __init__(self):
        self.url = 'https://developer.foursquare.com/docs/build-with-foursquare/categories/'
        # NOTE(review): this looks like a build-generated class name, so it may
        # change whenever the page is redeployed -- confirm before relying on it.
        self.category_class = 'VenueCategories__Wrapper-sc-1ysxg0y-0 dikXMT'

    def scrape_ids(self, write_to_file=False, file_type=None):
        """Fetch the categories page and pair each heading with an id.

        The stripped text of every ``<h3>`` inside the category list is paired,
        in document order, with the stripped text of every ``<p>`` inside the
        same list. If the two sequences differ in length the extra items are
        ignored (``zip`` semantics); later duplicates of a heading overwrite
        earlier ones.

        Args:
            write_to_file: When true, dump the mapping to
                ``foursquare_venue_categories.json`` in the working directory
                instead of returning it.
            file_type: Accepted for backward compatibility. Only JSON output is
                implemented, so every value results in a JSON file.

        Returns:
            ``dict`` mapping heading text to id text, or ``None`` when
            ``write_to_file`` is true.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the category list element is not found in the page.
        """
        response = requests.get(url=self.url, timeout=30)
        response.raise_for_status()

        # Pass the raw bytes so the parser can decode them itself.
        # ``str(bytes)`` would yield the "b'...'" repr, with escaped newlines
        # and mangled non-ASCII characters, instead of the actual HTML.
        soup = BeautifulSoup(markup=response.content, features='lxml')
        category_html = soup.find('ul', {'class': self.category_class})
        if category_html is None:
            raise ValueError(
                "Category list <ul class=%r> not found at %s"
                % (self.category_class, self.url)
            )

        headings = [tag.text.strip() for tag in category_html.find_all('h3')]
        category_ids = [tag.text.strip() for tag in category_html.find_all('p')]
        foursquare_cats = dict(zip(headings, category_ids))

        if write_to_file:
            # UTF-8 text mode replaces the codecs writer over a binary handle;
            # newline='\n' keeps the bytes on disk identical on every platform.
            with open(file='foursquare_venue_categories.json', mode='w',
                      encoding='utf-8', newline='\n') as jsfile:
                json.dump(obj=foursquare_cats, fp=jsfile, ensure_ascii=False, indent=2)

            print("Successfully written into a file...")
            return None

        return foursquare_cats
In [3]:
# fs = FourSquare()
In [4]:
# fs.scrape_ids(write_to_file=True)
In [5]:
# Load the category gist as text. The encoding is explicit so the result does
# not depend on the platform default.
# NOTE(review): assumes the gist file is UTF-8 -- confirm.
with open(file='foursquare_categories_gist.txt', mode='r', encoding='utf-8') as fcg:
    fcg_data = fcg.read()
In [6]:
# One list entry per line of the gist; the functions below address lines by
# their index in this list.
all_categories = fcg_data.split('\n')
In [7]:
def find_main_categories(data):
    """Collect the top-level category headings from the gist lines.

    A line counts as a main heading when it starts with ``##`` and its third
    character is not ``#``. The heading name is everything after the first two
    characters; its id is the second space-separated token of the next line.

    Args:
        data: Indexable sequence of text lines.

    Returns:
        Tuple ``(mapping, positions)``: ``mapping`` is a dict of heading name
        to id, and ``positions`` lists the index in ``data`` of every main
        heading, in order.
    """
    names, ids, positions = [], [], []

    for pos, line in enumerate(data):
        if not line.startswith('##'):
            continue
        if line[2] == '#':
            # Three or more hashes: a nested heading, not a main one.
            continue

        positions.append(pos)
        names.append(line[2:])
        ids.append(data[pos + 1].split(' ')[1])

    return dict(zip(names, ids)), positions
In [8]:
# mckv: main-category name -> id; midx: line index of each main heading.
mckv, midx = find_main_categories(data=all_categories)
In [9]:
mckv
Out[9]:
{'Arts & Entertainment': '4d4b7104d754a06370d81259',
 'College & University': '4d4b7105d754a06372d81259',
 'Event': '4d4b7105d754a06373d81259',
 'Food': '4d4b7105d754a06374d81259',
 'Nightlife Spot': '4d4b7105d754a06376d81259',
 'Outdoors & Recreation': '4d4b7105d754a06377d81259',
 'Professional & Other Places': '4d4b7105d754a06375d81259',
 'Residence': '4e67e38e036454776db1fb3a',
 'Shop & Service': '4d4b7105d754a06378d81259',
 'Travel & Transport': '4d4b7105d754a06379d81259'}
In [10]:
midx
Out[10]:
[0, 106, 184, 202, 580, 624, 778, 932, 944, 1240]
In [11]:
def find_submain_categories(data):
    """Collect every nested category heading (three or more ``#``) from the lines.

    The name is the heading text after its first three characters with any
    remaining ``#`` characters removed; the id is the second space-separated
    token of the following line.

    Args:
        data: Indexable sequence of text lines.

    Returns:
        Tuple ``(mapping, positions)``: ``mapping`` is a dict of sub-category
        name to id (a repeated name keeps the last id seen), and ``positions``
        lists the index in ``data`` of every matching heading.
    """
    sub_lookup = {}
    positions = []

    for pos, line in enumerate(data):
        if not line.startswith('###'):
            continue

        positions.append(pos)
        name = line[3:].replace('#', '')
        sub_lookup[name] = data[pos + 1].split(' ')[1]

    return sub_lookup, positions
In [12]:
# sckv: sub-category name -> id; sub_indicies: line index of each nested heading.
sckv, sub_indicies = find_submain_categories(data=all_categories)
In [13]:
len(sckv)
Out[13]:
654
In [14]:
len(sub_indicies)
Out[14]:
654
In [15]:
def compute_sub_indicies(data, midx):
    """Compute the inclusive line range that belongs to each main category.

    Each range starts on the line after a main heading and ends on the line
    before the next main heading; the last range ends on the final line of
    ``data``.

    Args:
        data: Sequence of text lines the indices refer to. Only its length is
            used.
        midx: Indices of the main headings within ``data``, in ascending order.

    Returns:
        List of ``[first, last]`` pairs (both inclusive), one per entry of
        ``midx``. Empty when ``midx`` is empty.
    """
    if not midx:
        return []

    # Use the ``data`` argument rather than a notebook-level global, so the
    # function gives the right answer for whatever sequence it is handed.
    last_line = len(data) - 1
    section_ends = [following - 1 for following in midx[1:]] + [last_line]

    return [[start + 1, end] for start, end in zip(midx, section_ends)]
In [16]:
# One [first, last] line range per main category, aligned with midx.
subidxs = compute_sub_indicies(data=all_categories, midx=midx)
In [17]:
print(subidxs)
[[1, 105], [107, 183], [185, 201], [203, 579], [581, 623], [625, 777], [779, 931], [933, 943], [945, 1239], [1241, 1328]]
In [18]:
len(subidxs)
Out[18]:
10
In [19]:
def extract_keys_values(mckv, sckv, midx, subidxs, sub_indicies):
    """Group sub-categories under the main category whose line range holds them.

    Main categories are matched to ``midx`` and ``subidxs`` by position, and
    sub-categories are matched to ``sub_indicies`` by position, so the dicts
    must be in the same order as their index lists.

    Args:
        mckv: Dict of main-category name to id.
        sckv: Dict of sub-category name to id.
        midx: Line index of each main heading; its length sets how many
            groups are produced.
        subidxs: ``[first, last]`` line range (both inclusive) per main
            category.
        sub_indicies: Line index of each sub-category heading.

    Returns:
        List with one dict per main category, of the form
        ``{'main_category': {name: id}, 'sub_categories': {name: id, ...}}``.
    """
    main_entries = list(zip(midx, mckv.keys(), mckv.values()))
    sub_entries = list(zip(sub_indicies, sckv.keys(), sckv.values()))

    ckv = []
    for i in range(len(midx)):
        _, main_name, main_id = main_entries[i]
        first, last = subidxs[i][0], subidxs[i][1]

        # Both bounds are inclusive, so compare with <= on each side.
        # ``range(first, last)`` would silently drop a heading sitting on
        # the last line of the section.
        sub_categories = {
            name: cat_id
            for (line_no, name, cat_id) in sub_entries
            if first <= line_no <= last
        }

        ckv.append({
            'main_category' : {main_name : main_id},
            'sub_categories' : sub_categories
        })

    return ckv
In [20]:
# Build one {'main_category', 'sub_categories'} record per main category.
extracted_categories = extract_keys_values(
    mckv=mckv, 
    sckv=sckv, 
    midx=midx, 
    subidxs=subidxs, 
    sub_indicies=sub_indicies
)
In [21]:
# Wrap the list under a single top-level key so the JSON document has an
# object, not a bare array, at its root.
formatted_categories = {
    'foursquare_venues' : extracted_categories
}
In [22]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding; the platform default may be unable to
# encode them or may produce JSON that is not valid UTF-8.
with open(file='foursquare_venue_categories.json', mode='w', encoding='utf-8') as ec:
    json.dump(obj=formatted_categories, fp=ec, ensure_ascii=False, indent=2)