import ujson, requests, pickle
from pathlib import Path
MODEL_DIR = "../models"
vectorizer = None  # Global, initialized by load_model() below
model = None  # Global, initialized by load_model() below
def read_jsonl(file_path):
    with Path(file_path).open('r', encoding='utf8') as infile:
        for line in infile:
            try:
                yield ujson.loads(line.strip())
            except ValueError:
                print("Error reading a JSON line; skipping it")
                continue
def write_jsonl(file_path, lines):
    data = [ujson.dumps(line, escape_forward_slashes=False) for line in lines]
    with Path(file_path).open('w', encoding='utf8') as outfile:
        outfile.write('\n'.join(data))
def load_model():
    '''
    Initialize the global vectorizer and model from the pickled artifacts.
    '''
    global vectorizer, model
    with open(MODEL_DIR + "/tfidf_vectorizer.pkl", "rb") as infile:
        vectorizer = pickle.load(infile)
    with open(MODEL_DIR + "/intent_clf.pkl", "rb") as infile:
        model = pickle.load(infile)
load_model()
def predict(text: str):
    '''
    Return the predicted intent and its confidence score for the given text.
    '''
    query_term_matrix = vectorizer.transform([text])
    raw_prediction = model.predict([query_term_matrix.A[0]])
    prediction = raw_prediction[0].split('-')[0]  # Strip the '-Accept' suffix from the intent label
    raw_prob = model.predict_proba([query_term_matrix.A[0]])
    probability = raw_prob[0]
    confidence_score = round(max(probability), 2)
    return (prediction, confidence_score)
vectorizer.transform??
text = 'Pod CrashLoopBackOff error'
doc_term_matrix = vectorizer.transform([text])
print(doc_term_matrix.shape);doc_term_matrix
(1, 3394)
print(doc_term_matrix.A);print(doc_term_matrix.A[0])
[[0. 0. 0. ... 0. 0. 0.]]
[0. 0. 0. ... 0. 0. 0.]
raw_prediction = model.predict([doc_term_matrix.A[0]])
prediction = raw_prediction[0].split('-')[0];prediction
'TROUBLESHOOT'
model.predict_proba??
raw_prob = model.predict_proba([doc_term_matrix.A[0]])
probability = raw_prob[0]
confidence_score = round(max(probability), 2); print(confidence_score)
1.0
Load the query, URL, and count; then fetch the title corresponding to each clicked URL and add it to the data above.
data = read_jsonl('data_sample.jsonl')
lines = list(data)
import urllib3
#Suppress InsecureRequestWarning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
DEFAULT_SEARCH_SERVICE_URI = 'https://api.access.redhat.com/rs/search'
ROWS = 50
ENABLE_SIGNALS = 'false'
ENABLE_ELEVATION = 'false' #Disable elevation as we want to evaluate against the natural search results
headers = {'Accept': 'application/vnd.redhat.solr+json'}
def update_params(query: str):
    params = {
        'q': query, 'wt': 'json', 'start': 0, 'rows': 1,
        'enableSignals': ENABLE_SIGNALS, 'enableElevation': ENABLE_ELEVATION,
        'fl': 'view_uri, allTitle'
    }
    return params
def execute_search(query: str):
    params = update_params(query)
    response = requests.get(url=DEFAULT_SEARCH_SERVICE_URI, params=params,
                            headers=headers, verify=False)
    if response.status_code != 200:
        print(response.text)
        return ("", "")  # Keep the return type consistent with the success path
    data = ujson.loads(response.text)
    docs = data.get('response', {}).get('docs', [])
    if len(docs) > 0:
        url = docs[0]['view_uri']
        title = docs[0]['allTitle']
        return (url, title)
    return ("", "")
urls = [entry.get('doc_id_s') for entry in lines]
results = [execute_search(f'view_uri:"{url}"') for url in urls]
result_dict = {url: title for url, (_, title) in zip(urls, results)}  # Map each clicked URL to its fetched title
for line in lines:
    url = line['doc_id_s']
    line['title'] = result_dict.get(url, '')
write_jsonl('data_sample.jsonl.out', lines)
Heuristics - We can run the query against the current model to get an intent and score (say A), and the title against the current model to get an intent and score (say B), then take max(A, B), i.e. keep the intent the model is more confident about. A sketch follows below.
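A minimal sketch of this heuristic, assuming the predict() function defined above; combined_intent is a hypothetical helper name, not part of the original pipeline.
def combined_intent(query: str, title: str):
    # Hypothetical helper: score both texts and keep whichever
    # intent the model is more confident about (max(A, B))
    query_intent, query_score = predict(query)
    title_intent, title_score = predict(title)
    if query_score >= title_score:
        return (query_intent, query_score)
    return (title_intent, title_score)
combined_intent('systemd', 'Overview of systemd for RHEL 7')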
sample_clickstream_data = [entry for entry in read_jsonl('data_sample.jsonl.out')]
len(sample_clickstream_data)
100
sample_clickstream_data[:1]
[{'query_s': 'systemd',
'doc_id_s': 'https://access.redhat.com/articles/754933',
'aggr_count_i': 40,
'title': 'Overview of systemd for RHEL 7'}]
predict('rhel')
('USAGE', 0.93)
print('Query prediction :{}'.format(predict(sample_clickstream_data[0].get('query_s'))))
Query prediction :('TROUBLESHOOT', 0.62)
print('Title prediction :{}'.format(predict(sample_clickstream_data[0].get('title'))))
Title prediction :('USAGE', 0.56)
for each in sample_clickstream_data:
    query_intent, query_prediction_score = predict(each.get('query_s'))
    title_intent, title_prediction_score = predict(each.get('title'))
    each['query_intent'], each['query_prediction_score'] = query_intent, query_prediction_score
    each['title_intent'], each['title_prediction_score'] = title_intent, title_prediction_score
sample_clickstream_data[0]
{'query_s': 'systemd',
'doc_id_s': 'https://access.redhat.com/articles/754933',
'aggr_count_i': 40,
'title': 'Overview of systemd for RHEL 7',
'query_intent': 'TROUBLESHOOT',
'query_prediction_score': 0.62,
'title_intent': 'USAGE',
'title_prediction_score': 0.56}
write_jsonl('data_with_predictions.jsonl.out', sample_clickstream_data)
!ls
data.json prepate-data-click-heuristics.ipynb
data.jsonl troubleshoot-usage-cases-18.jsonl
data_sample.jsonl troubleshoot-usage-cases.jsonl
data_sample.jsonl.out troubleshoot-usage-cases-manual.jsonl
data_with_predictions.jsonl.out troubleshoot-usage-kb.jsonl
data_with_predictions.json.out
import pandas as pd
df = pd.read_json('data_with_predictions.jsonl.out', lines=True)
df.head()
out_df = df[['query_s', 'query_intent', 'query_prediction_score', 'title', 'title_intent', 'title_prediction_score']];out_df.head()
out_df.head(100)
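Before committing, a sketch of applying the same max(A, B) heuristic across the whole dataframe; final_intent and final_score are hypothetical column names added here for illustration.
import numpy as np
# Hypothetical columns: per row, pick the intent with the higher confidence
query_wins = out_df['query_prediction_score'] >= out_df['title_prediction_score']
out_df = out_df.assign(
    final_intent=np.where(query_wins, out_df['query_intent'], out_df['title_intent']),
    final_score=out_df[['query_prediction_score', 'title_prediction_score']].max(axis=1),
)
out_df.head()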
import jovian
jovian.commit()
[jovian] Saving notebook..