import numpy as np
import h5py
import matplotlib.pyplot as plt
import math
def load_data():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels

    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels

    classes = np.array(test_dataset["list_classes"][:]) # the list of classes
    
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

def sigmoid(Z):
    A = 1/ ( 1 + np.exp(-Z))
    cache = Z
    
    return A, cache

def relu(Z):        
    A = np.maximum(0,Z)    
    assert(A.shape == Z.shape)    
    cache = Z 
    return A, cache

def initialize_parameters_deep(layer_dims):
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)    
    for l in range(1,L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.001
        parameters['b' + str(l)] = np.zeros((layer_dims[l],1))        
    return parameters

def initialize_parameters_deep_w(layer_dims):
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)    
    for l in range(1,L-1):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.1
        parameters['b' + str(l)] = np.zeros((layer_dims[l],1))
    # He-style scaling, sqrt(2 / fan_in), for the last layer's weights
    parameters['W' + str(L-1)] = np.random.randn(layer_dims[L-1], layer_dims[L-2]) * np.sqrt(2/layer_dims[L-2])
    parameters['b' + str(L-1)] = np.zeros((layer_dims[L-1],1))
    return parameters
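
# Quick shape check of the two initializers on a small, hypothetical architecture
# (5 inputs, two hidden layers, 1 output unit); the sizes are illustrative only.
_demo_dims = [5, 4, 3, 1]
_p_small = initialize_parameters_deep(_demo_dims)  # small random weights everywhere
_p_he = initialize_parameters_deep_w(_demo_dims)   # He-scaled weights in the last layer
print(_p_small['W1'].shape, _p_he['W3'].shape)     # (4, 5) (1, 3)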

def linear_forward(A, W, b):    
    Z = np.dot(W, A) + b    
    cache = [A, W, b]    
    return Z, cache

def linear_activation_forward(A_prev, W, b, activation):
    if activation == 'sigmoid':
        Z, linear_cache = linear_forward(A_prev, W, b)  # Z = W.A_prev + b, linear_cache = (A_prev, W, b)
        A, activation_cache = sigmoid(Z)                # A = sigmoid(Z), activation_cache = Z
    elif activation == 'relu':
        Z, linear_cache = linear_forward(A_prev, W, b)  # Z = W.A_prev + b, linear_cache = (A_prev, W, b)
        A, activation_cache = relu(Z)                   # A = relu(Z), activation_cache = Z
    cache = [linear_cache, activation_cache]    
    return A, cache

def L_model_forward(X, para):    
    Caches = []    
    A = X    
    L = len(para) // 2    
    for l in range(1,L):
        A_prev = A        
        A, cache = linear_activation_forward(A_prev, para['W'+ str(l)], para['b'+ str(l)], activation = 'relu')
        Caches.append(cache)       
    AL, cache = linear_activation_forward(A, para['W' + str(L)], para['b' + str(L)], activation = 'sigmoid')
    Caches.append(cache)    
    return AL, Caches
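
# Forward-pass shape check with toy data (hypothetical sizes, not the cat dataset):
# 2 input features, 5 examples, architecture [2, 4, 1] -> AL should be (1, 5).
_fp_params = initialize_parameters_deep([2, 4, 1])
_fp_AL, _fp_caches = L_model_forward(np.random.randn(2, 5), _fp_params)
print(_fp_AL.shape, len(_fp_caches))  # (1, 5), one cache per layer -> 2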

def L_model_forward_drop(X, para, dp):
    # Forward pass with inverted dropout on the hidden layers; dp[l-1] is the keep-probability
    # for hidden layer l. The sigmoid output layer is left untouched, and the dropout masks
    # are not cached, so the backward pass does not account for them.
    Caches = []
    A = X
    L = len(para) // 2
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, para['W' + str(l)], para['b' + str(l)], activation='relu')
        d = np.random.rand(A.shape[0], A.shape[1]) < dp[l-1]  # random keep/drop mask
        A = np.multiply(A, d) / dp[l-1]                       # drop units, then rescale (inverted dropout)
        Caches.append(cache)
    AL, cache = linear_activation_forward(A, para['W' + str(L)], para['b' + str(L)], activation='sigmoid')
    Caches.append(cache)
    return AL, Caches

def compute_cost(AL, Y):    
    m = Y.shape[1]    
    cost = (-1/m) * np.sum( np.multiply(Y, np.log(AL)) + np.multiply((1-Y), np.log(1-AL)))    
    cost = np.squeeze(cost)    
    return cost
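
# Quick numeric check of the cross-entropy cost with toy values: for AL = [0.9, 0.1]
# and Y = [1, 0] the cost should be -log(0.9), roughly 0.105.
print(compute_cost(np.array([[0.9, 0.1]]), np.array([[1, 0]])))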

def L2_reg_cost(para, Y, lam):
    # L2 penalty: (lam / (2*m)) * sum over layers of ||W_l||_F^2
    m = Y.shape[1]
    L = len(para) // 2
    sq_sum = 0.
    for l in range(1, L + 1):
        sq_sum += np.sum(np.square(para['W' + str(l)]))
    cost = (lam / (2 * m)) * sq_sum
    return cost
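
# Sketch of how the regularized cost would be assembled (toy values, purely illustrative):
# total cost = cross-entropy term + L2 penalty.
_rc_params = initialize_parameters_deep([2, 3, 1])
_rc_AL = np.array([[0.8, 0.1, 0.6]])  # hypothetical predictions
_rc_Y = np.array([[1, 0, 1]])         # hypothetical labels
print(compute_cost(_rc_AL, _rc_Y) + L2_reg_cost(_rc_params, _rc_Y, lam=0.7))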

def relu_backward(dA, cache):       
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0    
    assert (dZ.shape == Z.shape)    
    return dZ

def sigmoid_backward(dA, cache):
    Z = cache    
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)    
    assert (dZ.shape == Z.shape)    
    return dZ

def linear_backward(dZ, a_prev_wb):    
    A_prev, W, b = a_prev_wb    
    m = A_prev.shape[1]    
    dW  = (1/m) * np.dot(dZ, A_prev.T)
    db = (1/m) * np.sum(dZ, axis =1 , keepdims = True)
    dA_prev = np.dot(W.T, dZ)    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)    
    return dA_prev, dW, db

def linear_backward_reg(dZ, a_prev_wb, lam):    
    A_prev, W, b = a_prev_wb    
    m = A_prev.shape[1]    
    dW  = (1/m) * np.dot(dZ, A_prev.T) + (lam/m) * W
    db = (1/m) * np.sum(dZ, axis =1 , keepdims = True)
    dA_prev = np.dot(W.T, dZ)    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)    
    return dA_prev, dW, db

def linear_activation_backward(dA, cache, activation):    
    linear_cache, Z = cache # linear_cache = a,w,b    
    if activation == 'relu':
        dZ = relu_backward(dA, Z)        
        dA_prev, dW, db = linear_backward(dZ, linear_cache)         
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, Z)        
        dA_prev, dW, db = linear_backward(dZ, linear_cache)         
    return dA_prev, dW, db

def linear_activation_backward_reg(dA, cache,lam, activation):    
    linear_cache, Z = cache # linear_cache = a,w,b    
    if activation == 'relu':
        dZ = relu_backward(dA, Z)        
        dA_prev, dW, db = linear_backward_reg(dZ, linear_cache,lam)         
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, Z)        
        dA_prev, dW, db = linear_backward_reg(dZ, linear_cache,lam)         
    return dA_prev, dW, db   

def L_model_backward(AL, Y, awb_z):   
    grads = {}
    L = len(awb_z) # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL    
    # Initializing the backpropagation
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    grads["dA"+ str(L-1)], grads["dW"+str(L)], grads["db"+ str(L)] = linear_activation_backward(dAL, awb_z[L-1], activation='sigmoid')
    for l in range(L-1,0,-1):
        grads["dA"+ str(l-1)], grads["dW"+str(l)], grads["db"+ str(l)] = linear_activation_backward(grads["dA"+str(l)], awb_z[l-1], activation='relu')
    return grads   
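
# Backward-pass shape check on toy data: each gradient should mirror its parameter's shape
# (the [2, 4, 1] architecture and random inputs are hypothetical).
_bp_params = initialize_parameters_deep([2, 4, 1])
_bp_AL, _bp_caches = L_model_forward(np.random.randn(2, 5), _bp_params)
_bp_Y = (np.random.rand(1, 5) > 0.5).astype(int)
_bp_grads = L_model_backward(_bp_AL, _bp_Y, _bp_caches)
print(_bp_grads["dW1"].shape, _bp_grads["db2"].shape)  # (4, 2) (1, 1)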

def L_model_backward_reg(AL, Y, awb_z, lam):   
    grads = {}
    L = len(awb_z) # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL    
    # Initializing the backpropagation
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    grads["dA"+ str(L-1)], grads["dW"+str(L)], grads["db"+ str(L)] = linear_activation_backward_reg(dAL, awb_z[L-1],lam, activation='sigmoid')    
    for l in range(L-1,0,-1):
        grads["dA"+ str(l-1)], grads["dW"+str(l)], grads["db"+ str(l)] = linear_activation_backward_reg(grads["dA"+str(l)], awb_z[l-1],lam, activation='relu')        
    return grads 

def update_parameters(parameters, grads, learning_rate):    
    L = len(parameters) // 2
    nw_para = parameters.copy()
    for l in range(L):
        nw_para["W"+ str(l+1)] = parameters["W"+ str(l+1)] - learning_rate*(grads["dW"+ str(l+1)])
        nw_para["b"+ str(l+1)] = parameters["b"+ str(l+1)] - learning_rate*(grads["db"+ str(l+1)])        
    return nw_para
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
def update_parameters_with_gd(parameters, grads, learning_rate):    
    L = len(parameters) // 2
    nw_para = parameters.copy()
    for l in range(L):
        nw_para["W"+ str(l+1)] = parameters["W"+ str(l+1)] - learning_rate*(grads["dW"+ str(l+1)])
        nw_para["b"+ str(l+1)] = parameters["b"+ str(l+1)] - learning_rate*(grads["db"+ str(l+1)])        
    return nw_para

def random_mini_batches(X, Y, mini_batch_size, seed = 0):
    np.random.seed(seed)
    m = X.shape[1]
    mini_batches = []
    #shuffle data
    permutation = list(np.random.permutation(m))
    shu_x = X[:, permutation]
    shu_y = Y[:, permutation].reshape((1,m))    
    num_compl = math.floor(m/mini_batch_size)    
    for k in range(0,num_compl):
        mini_batch_X = shu_x[: , k * mini_batch_size : (k+1)* mini_batch_size]
        mini_batch_Y = shu_y[: , k * mini_batch_size : (k+1)* mini_batch_size]        
        mini_batch = (mini_batch_X, mini_batch_Y)        
        mini_batches.append(mini_batch)        
    if m % mini_batch_size != 0:
        mini_batch_X = shu_x[: , num_compl * mini_batch_size:]
        mini_batch_Y = shu_y[: , num_compl * mini_batch_size:]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)        
    return mini_batches    
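
# Toy check of the mini-batch split (hypothetical data): 10 examples in batches of 4
# should give three batches of sizes 4, 4 and 2.
_mb_X = np.random.randn(3, 10)
_mb_Y = (np.random.rand(1, 10) > 0.5).astype(int)
_mb = random_mini_batches(_mb_X, _mb_Y, mini_batch_size=4, seed=1)
print([b[0].shape[1] for b in _mb])  # [4, 4, 2]
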
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
def initialize_velocity_mom(parameters):    
    L = len(parameters) //2    
    v = {}    
    for l in range(L):
        v["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        v["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
    return v

def initialize_S_RMS_PROP(parameters):
    L = len(parameters) //2    
    S = {}    
    for l in range(L):
        S["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        S["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
    return S
    
def Update_para_with_momentum(parameters, grads, v, beta1, learning_rate):
    beta = beta1    
    L = len(parameters) // 2    
    for l in range(L):        
        v["dW" + str(l+1)] = beta * v["dW" + str(l+1)] + (1-beta) * grads["dW"+ str(l+1)]
        v["db" + str(l+1)] = beta * v["db" + str(l+1)] + (1-beta) * grads["db"+ str(l+1)]
        
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * v["dW" +str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * v["db" +str(l+1)]
        
    return parameters, v

def Update_para_with_RMS(parameters, grads, S, beta2, learning_rate, epsilon):
    L = len(parameters) // 2   
    s_c ={}
    for l in range(L):        
        S["dW" + str(l+1)] = (beta2 * S["dW" + str(l+1)]) + ((1 - beta2) * (grads["dW" + str(l+1)] * grads["dW" + str(l+1)]))
        S["db" + str(l+1)] = (beta2 * S["db" + str(l+1)]) + ((1 - beta2) * (grads["db" + str(l+1)] * grads["db" + str(l+1)]))
        
        s_c["dW" + str(l+1)] = S["dW" + str(l+1)] / (1 - beta2)
        s_c["db" + str(l+1)] = S["db" + str(l+1)] / (1 - beta2)
        
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - ((learning_rate * grads["dW" + str(l+1)]) / (np.sqrt(s_c["dW" + str(l+1)] + epsilon)))
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - ((learning_rate * grads["db" + str(l+1)]) / (np.sqrt(s_c["db" + str(l+1)] + epsilon)))
                
    return parameters, S

def initialize_adam(parameters):
    L = len(parameters) //2    
    v = {}
    s = {}    
    for l in range(L):
        v["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        v["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
        s["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        s["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)      

    return v,s

def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1 = 0.9, 
                                 beta2 = 0.999, epsilon = 1e-8):
    L = len(parameters) // 2                 
    v_corrected = {}                         
    s_corrected = {}                        
    
    for l in range(L):
        
        v["dW" + str(l+1)] = beta1 * v["dW" + str(l+1)] + (1 - beta1) * grads["dW" + str(l+1)]
        v["db" + str(l+1)] = beta1 * v["db" + str(l+1)] + (1 - beta1) * grads["db" + str(l+1)]
       
        v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)] / (np.power((1 - beta1), l))
        v_corrected["db" + str(l+1)] = v["db" + str(l+1)] / (np.power((1 - beta1), l))
                
        s["dW" + str(l+1)] = beta2 * s["dW" + str(l+1)] + (1 - beta2) * (grads["dW" + str(l+1)] * grads["dW" + str(l+1)])
        s["db" + str(l+1)] = beta2 * s["db" + str(l+1)] + (1 - beta2) * (grads["db" + str(l+1)] * grads["db" + str(l+1)])
       
        s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)] / (np.power((1 - beta2), l))
        s_corrected["db" + str(l+1)] = s["db" + str(l+1)] / (np.power((1 - beta2), l))
        
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - (learning_rate * v_corrected["dW" + str(l+1)]) / (np.sqrt(s_corrected["dW" + str(l+1)] + epsilon))
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - (learning_rate * v_corrected["db" + str(l+1)]) / (np.sqrt(s_corrected["db" + str(l+1)] + epsilon))
        
    return parameters, v, s
#########################################################################
import time
import numpy as np
import h5py
import matplotlib.pyplot as plt
import scipy
from PIL import Image
from scipy import ndimage
#from dnn_app_utils_v3 import *
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
np.random.seed(1)   
train_x_orig, train_y, test_x_orig, test_y, classes = load_data()
m_train = train_x_orig.shape[0]
num_px = train_x_orig.shape[1]
m_test = test_x_orig.shape[0]
print ("Number of training examples: " + str(m_train))
print ("Number of testing examples: " + str(m_test))
print ("Each image is of size: (" + str(num_px) + ", " + str(num_px) + ", 3)")
print ("train_x_orig shape: " + str(train_x_orig.shape))
print ("train_y shape: " + str(train_y.shape))
print ("test_x_orig shape: " + str(test_x_orig.shape))
print ("test_y shape: " + str(test_y.shape))

train_x_flatten = train_x_orig.reshape(train_x_orig.shape[0], -1).T
test_x_flatten = test_x_orig.reshape(test_x_orig.shape[0], -1).T
train_x = train_x_flatten /255
test_x = test_x_flatten /255
# Output of the print statements above:
# Number of training examples: 209
# Number of testing examples: 50
# Each image is of size: (64, 64, 3)
# train_x_orig shape: (209, 64, 64, 3)
# train_y shape: (1, 209)
# test_x_orig shape: (50, 64, 64, 3)
# test_y shape: (1, 50)
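
#########################################################################
# A minimal end-to-end training sketch (not part of the original script): it wires the
# helpers above together with mini-batches and Adam. The architecture, learning rate,
# epoch count and batch size are arbitrary illustrative choices, not tuned values.
def L_layer_model_sketch(X, Y, layers_dims, learning_rate=0.0075, num_epochs=100, mini_batch_size=64):
    parameters = initialize_parameters_deep_w(layers_dims)  # He-scaled last layer
    v, s = initialize_adam(parameters)
    t = 0                                                   # global Adam step counter
    costs = []
    for epoch in range(num_epochs):
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed=epoch)
        for minibatch_X, minibatch_Y in minibatches:
            AL, caches = L_model_forward(minibatch_X, parameters)
            grads = L_model_backward(AL, minibatch_Y, caches)
            t += 1
            parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate)
        if epoch % 10 == 0:
            full_AL, _ = L_model_forward(X, parameters)
            costs.append(compute_cost(full_AL, Y))
    return parameters, costs

# Example call (commented out; uncomment to actually train on the cat/non-cat data):
# layers_dims = [num_px * num_px * 3, 20, 7, 5, 1]  # hypothetical 4-layer architecture
# trained_parameters, costs = L_layer_model_sketch(train_x, train_y, layers_dims)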