Jovian
⭐️
Sign In
In [1]:
import pprint
import csv
from math import *
In [2]:
# Read the whole training CSV into a list of rows (each row is a list of
# strings, header row included). The context manager closes the file handle as
# soon as the rows are materialised; newline='' lets the csv module do its own
# line-ending handling, as its documentation recommends.
with open('datasets/3train.csv','r',newline='') as train_file:
    lines=list(csv.reader(train_file))
lines
Out[2]:
[['Outlook', 'Temperature', 'Humidity', 'Wind', 'Target'],
 ['sunny', 'hot', 'high', 'weak', 'no'],
 ['sunny', 'hot', 'high', 'strong', 'no'],
 ['overcast', 'hot', 'high', 'weak', 'yes'],
 ['rain', 'mild', 'high', 'weak', 'yes'],
 ['rain', 'cool', 'normal', 'weak', 'yes'],
 ['rain', 'cool', 'normal', 'strong', 'no'],
 ['overcast', 'cool', 'normal', 'strong', 'yes'],
 ['sunny', 'mild', 'high', 'weak', 'no'],
 ['sunny', 'cool', 'normal', 'weak', 'yes'],
 ['rain', 'mild', 'normal', 'weak', 'yes'],
 ['sunny', 'mild', 'normal', 'strong', 'yes'],
 ['overcast', 'mild', 'high', 'strong', 'yes'],
 ['overcast', 'hot', 'normal', 'weak', 'yes'],
 ['rain', 'mild', 'high', 'strong', 'no']]
In [3]:
# Detach the header row (column names) from the data rows.
# NOTE: pop() mutates `lines` in place, so re-running this cell without
# re-running the load cell removes a real data row.
data = lines.pop(0)
# Header, a blank line, then the remaining rows.
print(data, lines, sep='\n\n')
['Outlook', 'Temperature', 'Humidity', 'Wind', 'Target'] [['sunny', 'hot', 'high', 'weak', 'no'], ['sunny', 'hot', 'high', 'strong', 'no'], ['overcast', 'hot', 'high', 'weak', 'yes'], ['rain', 'mild', 'high', 'weak', 'yes'], ['rain', 'cool', 'normal', 'weak', 'yes'], ['rain', 'cool', 'normal', 'strong', 'no'], ['overcast', 'cool', 'normal', 'strong', 'yes'], ['sunny', 'mild', 'high', 'weak', 'no'], ['sunny', 'cool', 'normal', 'weak', 'yes'], ['rain', 'mild', 'normal', 'weak', 'yes'], ['sunny', 'mild', 'normal', 'strong', 'yes'], ['overcast', 'mild', 'high', 'strong', 'yes'], ['overcast', 'hot', 'normal', 'weak', 'yes'], ['rain', 'mild', 'high', 'strong', 'no']]
In [4]:
def entropy(pos,neg):
    """Return the entropy, in bits, of a two-class split.

    pos, neg: number of examples in each of the two classes.

    A split where either count is zero is pure and yields 0.
    """
    if pos != 0 and neg != 0:
        total = pos + neg
        share_pos = pos / total
        share_neg = neg / total
        return -share_pos * log(share_pos, 2) - share_neg * log(share_neg, 2)
    # At least one class is absent: nothing left to be uncertain about.
    return 0
In [5]:
def gain(lines,attr,pos,neg):
    """Return the information gain of splitting `lines` on column `attr`.

    lines: rows whose last element is the class label.
    attr:  index of the column to split on.
    pos, neg: class counts for the whole of `lines`; they define the entropy
              before the split and the weight denominator.

    Only the labels 'yes' and 'no' are counted when weighting each branch.
    """
    base_entropy = entropy(pos, neg)

    # attribute value -> {label -> number of rows}
    label_counts = {}
    for row in lines:
        per_value = label_counts.setdefault(row[attr], {})
        per_value[row[-1]] = per_value.get(row[-1], 0) + 1

    # Entropy of each branch, weighted by the fraction of rows that fall in it.
    remainder = 0
    for per_value in label_counts.values():
        yes_count = per_value.get('yes', 0)
        no_count = per_value.get('no', 0)
        remainder += (yes_count + no_count) / (pos + neg) * entropy(yes_count, no_count)

    return base_entropy - remainder
In [6]:
def build(lines,data):
    """Recursively build an ID3 decision tree as nested dicts.

    lines: non-empty list of rows; the last element of each row is the class
           label ('yes' counts as positive, anything else as negative).
    data:  column names aligned with the row positions; used to label nodes.

    Returns either a leaf label ('yes' / 'no') or a dict of the form
    {attribute_name: {attribute_value: subtree_or_label, ...}}.
    """
    pos=len([x for x in lines if x[-1]=='yes'])
    sz=len(lines[0])-1  # number of attribute columns still available
    neg=len(lines)-pos

    # Pure node: every row carries the same label.
    if neg==0 or pos==0:
        return 'yes' if neg==0 else 'no'

    # Every attribute has been used but the rows still disagree (contradictory
    # examples). There is nothing left to split on, and max() over an empty
    # candidate list would raise ValueError, so fall back to the majority
    # label; a tie resolves to 'yes'.
    if sz==0:
        return 'yes' if pos>=neg else 'no'

    # Choose the column with the highest information gain. The candidates are
    # [gain, index] pairs, so equal gains are broken by the larger index.
    root=max([[gain(lines,i,pos,neg),i] for i in range(sz)])[1]

    fin,res={},{}
    uniq_attr=set([x[root] for x in lines])
    print(">>>",uniq_attr)  # trace of the values branched on at this node

    for i in uniq_attr:
        # Recurse on the rows that take this value, with the chosen column
        # removed from both the rows and the column names.
        res[i]=build([x[:root]+x[root+1:] for x in lines if x[root]==i],data[:root]+data[root+1:])

    fin[data[root]]=res
    return fin
In [7]:
# Learn the decision tree from the training rows (`data` holds the column
# names) and pretty-print the resulting nested dict.
tree=build(lines,data)
pprint.pprint(tree)
>>> {'sunny', 'rain', 'overcast'} >>> {'high', 'normal'} >>> {'weak', 'strong'} {'Outlook': {'overcast': 'yes', 'rain': {'Wind': {'strong': 'no', 'weak': 'yes'}}, 'sunny': {'Humidity': {'high': 'no', 'normal': 'yes'}}}}
In [8]:
def classify(instance, tree,default=None):
    """Predict the label of `instance` by walking the decision tree.

    instance: mapping from attribute name to value (a dict or a pandas row).
    tree:     nested dict of the form {attribute: {value: subtree_or_label}}.
    default:  returned when the instance carries a value that has no branch
              at some node, at any depth of the tree.

    Returns the leaf label reached, or `default`.
    """
    attribute = next(iter(tree))  # the attribute tested at this node
    branches = tree[attribute]
    value = instance[attribute]

    if value not in branches:
        # Value never seen under this node during training.
        return default

    result = branches[value]
    if isinstance(result, dict):
        # Subtree: keep descending, carrying `default` along so an unseen
        # value further down still yields the caller's fallback, not None.
        return classify(instance, result, default)
    return result  # leaf label
In [9]:
import pandas as pd
# Load the unlabeled test rows and add a 'predicted' column by running each
# row through the learned tree; '?' marks rows the tree cannot classify.
df_new = pd.read_csv('datasets/3test.csv')
df_new['predicted'] = df_new.apply(lambda row: classify(row, tree, '?'), axis=1)
print(df_new)
Outlook Temperature Humidity Wind predicted 0 rain cool normal strong no 1 sunny mild normal strong yes
In [ ]: