In [14]:
import ujson, requests, pickle
from pathlib import Path
In [17]:
MODEL_DIR = "../models"
vectorizer = None  # global; populated by load_model()
model = None  # global; populated by load_model()

Utils

In [2]:
def read_jsonl(file_path):
    '''
    Yield one parsed JSON object per line, skipping malformed lines.
    '''
    with Path(file_path).open('r', encoding='utf8') as infile:
        for line in infile:
            try:
                yield ujson.loads(line.strip())
            except ValueError:
                print("Skipping a malformed JSON line")
                continue
In [3]:
def write_jsonl(file_path, lines):
    '''
    Serialize each object to JSON and write one object per line.
    '''
    data = [ujson.dumps(line, escape_forward_slashes=False) for line in lines]
    with Path(file_path).open('w', encoding='utf8') as outfile:
        outfile.write('\n'.join(data))
In [18]:
def load_model():
    '''
    Load the TF-IDF vectorizer and the intent classifier into the global variables.
    '''
    global vectorizer, model
    with open(MODEL_DIR + "/tfidf_vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)
    with open(MODEL_DIR + "/intent_clf.pkl", "rb") as f:
        model = pickle.load(f)
In [19]:
load_model()
In [54]:
def predict(text: str):
    '''
    Return the predicted intent and its confidence score for the given text.
    '''
    query_term_matrix = vectorizer.transform([text])
    raw_prediction = model.predict([query_term_matrix.A[0]])
    prediction = raw_prediction[0].split('-')[0]  # Keep only the intent prefix, dropping the '-Accept' suffix

    raw_prob = model.predict_proba([query_term_matrix.A[0]])
    probability = raw_prob[0]
    confidence_score = round(max(probability), 2)
    return (prediction, confidence_score)
In [47]:
vectorizer.transform??
In [23]:
text = 'Pod CrashLoopBackOff error'

doc_term_matrix = vectorizer.transform([text])
In [39]:
print(doc_term_matrix.shape);doc_term_matrix
Out[39]:
(1, 3394)
In [38]:
print(doc_term_matrix.A);print(doc_term_matrix.A[0])
[[0. 0. 0. ... 0. 0. 0.]]
[0. 0. 0. ... 0. 0. 0.]
In [41]:
raw_prediction = model.predict([doc_term_matrix.A[0]])
In [44]:
prediction = raw_prediction[0].split('-')[0];prediction
Out[44]:
'TROUBLESHOOT'
In [48]:
model.predict_proba??
In [50]:
raw_prob = model.predict_proba([doc_term_matrix.A[0]])
In [53]:
probability = raw_prob[0]
confidence_score = round(max(probability), 2); print(confidence_score)
1.0

Load and prepare the data

Load the clickstream records (query, clicked URL, click count), fetch the title corresponding to each clicked URL, and add it to the data above.
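Each line of data_sample.jsonl is one JSON object; judging from the fields consumed later in this notebook, a record looks like this (values illustrative):

{"query_s": "systemd", "doc_id_s": "https://access.redhat.com/articles/754933", "aggr_count_i": 40}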

In [4]:
data = read_jsonl('data_sample.jsonl')
In [5]:
lines = list(data)
In [6]:
import urllib3
#Suppress InsecureRequestWarning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
DEFAULT_SEARCH_SERVICE_URI = 'https://api.access.redhat.com/rs/search'
ROWS = 50  # not used below; each title lookup requests only the top result
ENABLE_SIGNALS = 'false'
ENABLE_ELEVATION = 'false'  # Disable elevation as we want to evaluate against the natural search results
headers = {'Accept': 'application/vnd.redhat.solr+json'}

def update_params(query: str):
    params = {
        'q': query, 'wt': 'json', 'start': 0, 'rows': 1,        
        'enableSignals': ENABLE_SIGNALS, 'enableElevation': ENABLE_ELEVATION,
        'fl': 'view_uri, allTitle'
    }    
    return params
In [7]:
def execute_search(query: str):
    params = update_params(query)
    response = requests.get(url=DEFAULT_SEARCH_SERVICE_URI, params=params,
             headers=headers, verify=False)
    if response.status_code != 200:
        print(response.text)
        return ("", "")  # keep the return type consistent with the success path
    data = ujson.loads(response.text)
    docs = data.get('response', {}).get('docs', [])
    if len(docs) > 0:
        url = docs[0]['view_uri']
        title = docs[0]['allTitle']
        return (url, title)
    return ("", "")
    
In [8]:
urls = [entry.get('doc_id_s') for entry in lines]
In [33]:
results = [execute_search(f'view_uri:"{url}"') for url in urls]
result_dict = {url: title for url, title in results if url}  # map view_uri -> title for the lookup below
In [45]:
for line in lines:
    url = line['doc_id_s']
    title = result_dict.get(url, '')
    line['title'] = title
In [49]:
write_jsonl('data_sample.jsonl.out', lines)

Apply the heuristics

Heuristics - run both the query and the clicked title against the current model: the query yields an intent and score (A), and the title yields an intent and score (B). Take whichever prediction has the higher score, i.e. max(A, B).
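The loop below only stores both predictions on each record; the max(A, B) selection itself is not applied in these cells. A minimal sketch of that step, assuming the record fields populated later in this notebook:

def resolve_intent(record):
    # Return whichever prediction (query vs. title) carries the higher confidence score.
    if record['query_prediction_score'] >= record['title_prediction_score']:
        return record['query_intent'], record['query_prediction_score']
    return record['title_intent'], record['title_prediction_score']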

In [12]:
sample_clickstream_data = list(read_jsonl('data_sample.jsonl.out'))
In [13]:
len(sample_clickstream_data)
Out[13]:
100
In [58]:
sample_clickstream_data[:1]
Out[58]:
[{'query_s': 'systemd',
  'doc_id_s': 'https://access.redhat.com/articles/754933',
  'aggr_count_i': 40,
  'title': 'Overview of systemd for RHEL 7'}]
In [56]:
predict('rhel')
Out[56]:
('USAGE', 0.93)
In [61]:
print('Query prediction :{}'.format(predict(sample_clickstream_data[0].get('query_s'))))
Query prediction :('TROUBLESHOOT', 0.62)
In [62]:
print('Title prediction :{}'.format(predict(sample_clickstream_data[0].get('title'))))
Title prediction :('USAGE', 0.56)
In [65]:
for each in sample_clickstream_data:
    query_intent, query_prediction_score = predict(each.get('query_s'))
    title_intent, title_prediction_score = predict(each.get('title'))
    each['query_intent'], each['query_prediction_score'] = query_intent, query_prediction_score
    each['title_intent'], each['title_prediction_score'] = title_intent, title_prediction_score
In [66]:
sample_clickstream_data[0]
Out[66]:
{'query_s': 'systemd',
 'doc_id_s': 'https://access.redhat.com/articles/754933',
 'aggr_count_i': 40,
 'title': 'Overview of systemd for RHEL 7',
 'query_intent': 'TROUBLESHOOT',
 'query_prediction_score': 0.62,
 'title_intent': 'USAGE',
 'title_prediction_score': 0.56}
In [68]:
write_jsonl('data_with_predictions.jsonl.out', sample_clickstream_data)
In [69]:
!ls
data.json                        prepate-data-click-heuristics.ipynb
data.jsonl                       troubleshoot-usage-cases-18.jsonl
data_sample.jsonl                troubleshoot-usage-cases.jsonl
data_sample.jsonl.out            troubleshoot-usage-cases-manual.jsonl
data_with_predictions.jsonl.out  troubleshoot-usage-kb.jsonl
data_with_predictions.json.out
In [71]:
import pandas as pd
In [72]:
df = pd.read_json('data_with_predictions.jsonl.out', lines=True)
In [73]:
df.head()
Out[73]:
In [81]:
out_df = df[['query_s', 'query_intent', 'query_prediction_score', 'title', 'title_intent', 'title_prediction_score']];out_df.head()
Out[81]:
In [82]:
out_df.head(100)
Out[82]:
In [75]:
import jovian
In [ ]:
jovian.commit()
[jovian] Saving notebook..