In [1]:
import ujson, requests, pickle
from pathlib import Path
In [2]:
MODEL_DIR = "../models"
vectorizer = None  # global, initialized by load_model()
model = None  # global, initialized by load_model()

Utils

In [47]:
import urllib3
# Suppress InsecureRequestWarning since verify=False is used below
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
DEFAULT_SEARCH_SERVICE_URI = 'https://api.access.redhat.com/rs/search'
ENABLE_SIGNALS = 'false'
ENABLE_ELEVATION = 'false'  # Disable elevation, as we want to evaluate against the natural search results
headers = {'Accept': 'application/vnd.redhat.solr+json'}

def update_params(query: str):
    # Fetch only the top hit; 'fl' restricts the returned fields to the URL and title
    params = {
        'q': query, 'wt': 'json', 'start': 0, 'rows': 1,
        'enableSignals': ENABLE_SIGNALS, 'enableElevation': ENABLE_ELEVATION,
        'fl': 'view_uri, allTitle'
    }
    return params
In [48]:
def execute_search(query: str):
    '''
    Return (url, title) of the top search result for the given query.
    '''
    params = update_params(query)
    response = requests.get(url=DEFAULT_SEARCH_SERVICE_URI, params=params,
                            headers=headers, verify=False)
    if response.status_code != 200:
        print(response.text)
        return ("", "")
    data = ujson.loads(response.text)
    docs = data.get('response', {}).get('docs', [])
    if len(docs) > 0:
        url = docs[0]['view_uri']
        title = docs[0]['allTitle']
        return (url, title)
    return ("", "")
In [3]:
def read_jsonl(file_path):
    with Path(file_path).open('r', encoding='utf8') as infile:
        for line in infile:
            try:
                yield ujson.loads(line.strip())
            except ValueError:
                print("Error reading a JSON line; skipping it")
                continue
In [4]:
def write_jsonl(file_path, lines):
    data = [ujson.dumps(line, escape_forward_slashes=False) for line in lines]
    with Path(file_path).open('w', encoding='utf8') as outfile:
        outfile.write('\n'.join(data))
In [5]:
def load_model():
    '''
    Load the vectorizer and the classifier into the global variables.
    '''
    global vectorizer, model
    with open(MODEL_DIR + "/tfidf_vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)
    with open(MODEL_DIR + "/intent_clf.pkl", "rb") as f:
        model = pickle.load(f)
In [6]:
load_model()
In [7]:
def predict(text: str):
    '''
    Return the predicted intent and its confidence score for the given text.
    '''
    query_term_matrix = vectorizer.transform([text])
    raw_prediction = model.predict([query_term_matrix.A[0]])
    prediction = raw_prediction[0].split('-')[0]  # Strip the '-Accept' suffix from the raw intent label

    raw_prob = model.predict_proba([query_term_matrix.A[0]])
    probability = raw_prob[0]
    confidence_score = round(max(probability), 2)
    return (prediction, confidence_score)

Load and prepare the data

Load the query, clicked url, and click count, then fetch the title corresponding to the clicked url and add it to this data.

In [8]:
signals = read_jsonl('signals.jsonl')
In [9]:
lines = list(signals); len(lines)
Out[9]:
889000
In [10]:
import pandas as pd
In [11]:
df = pd.read_json('signals.jsonl', lines=True);df.head()
Out[11]:
In [31]:
# Filter out Errata (RHEA/RHBA) and CVE queries
filtered_lines = [line for line in lines
                  if not line['query_s'].startswith(('RHEA', 'CVE', 'RHBA'))]; len(filtered_lines)
Out[31]:
888675

Load the url-to-title map

In [35]:
url_title_map = {line['view_uri']: line.get('allTitle', "NA") for line in read_jsonl('content.jsonl')}
In [36]:
len(url_title_map.keys())
Out[36]:
311738

Filter out any entries with an 'NA' value from url_title_map

In [39]:
url_title_map_filtered = {k: v for k, v in url_title_map.items() if v != 'NA'}; len(url_title_map_filtered)
Out[39]:
311608
In [40]:
content_map = url_title_map_filtered
In [33]:
df = pd.read_json('content.jsonl', lines=True);df.head()
Out[33]:

Now iterate over the signals and add the title corresponding to the clicked doc

In [44]:
filtered_lines[:10]
Out[44]:
[{'query_s': 'selinux logs',
  'doc_id_s': 'https://access.redhat.com/solutions/62084',
  'id': '000000a6-3962-4346-abf6-0cac76c53e1d',
  'aggr_count_i': 5,
  'title': 'Where can I find SELinux log file ?'},
 {'query_s': 'devtoolset-7-perftools',
  'doc_id_s': 'https://access.redhat.com/errata/RHEA-2017:3008',
  'id': '000003fd-fe1b-4c66-9d5b-60c8a2ec8b5d',
  'aggr_count_i': 5,
  'title': 'NA'},
 {'query_s': 'smp_affinity  core',
  'doc_id_s': 'https://access.redhat.com/solutions/363454',
  'id': '00000f62-cc63-4a74-9c23-efe32147d257',
  'aggr_count_i': 6,
  'title': 'Why are IRQs unbalanced in my multi core, single socket system?'},
 {'query_s': 'openshift online',
  'doc_id_s': 'https://access.redhat.com/solutions/2085783',
  'id': '00001713-fcb3-4b11-98f6-258b8ca1a2a2',
  'aggr_count_i': 5,
  'title': 'OpenShift Online pricing'},
 {'query_s': 'how to add new slots in my satellite 5.7',
  'doc_id_s': 'https://access.redhat.com/solutions/16481',
  'id': '00001a4a-7364-42ef-b3d1-e5ed829d22ee',
  'aggr_count_i': 5,
  'title': 'What is a Satellite Certificate? What do the fields in the certificate mean for Satellite 5.6 or 5.7?'},
 {'query_s': 'logwatch',
  'doc_id_s': 'https://access.redhat.com/solutions/3178001',
  'id': '00002033-eb0b-4a68-8e9e-59a900493cd6',
  'aggr_count_i': 7,
  'title': 'logwatch cache in /var/cache/logwatch is not removed.'},
 {'query_s': '"CPU feature" pku not found',
  'doc_id_s': 'https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/7.5_release_notes/',
  'id': '0000225b-cebc-4371-a4d3-44c028e02fbf',
  'aggr_count_i': 5,
  'title': '7.5 Release Notes - Red Hat Enterprise Linux 7 7.5 Release Notes'},
 {'query_s': 'bpms component details',
  'doc_id_s': 'https://access.redhat.com/products/red-hat-jboss-bpm-suite',
  'id': '000027fd-6356-40e8-a663-7430a879e50d',
  'aggr_count_i': 7,
  'title': 'Redirect node for BPM suite'},
 {'query_s': 'service level agreement',
  'doc_id_s': 'https://access.redhat.com/solutions/9296',
  'id': '00002832-3ec6-415b-8d9d-193f5d11d66f',
  'aggr_count_i': 7,
  'title': 'Where can I see a copy of the SLA (Service Level Agreement) for Red Hat Global Support?'},
 {'query_s': 'hyperv-daemons',
  'doc_id_s': 'https://access.redhat.com/downloads/content/hyperv-daemons/0-0.29.20160216git.el7/x86_64/fd431d51/package',
  'id': '00003896-b816-4cb9-8e12-fe7a3c32e6b3',
  'aggr_count_i': 6,
  'title': 'NA'}]
In [45]:
for line in filtered_lines:
    doc = line['doc_id_s']
    title = content_map.get(doc, 'NA')
    line['title'] = title
In [46]:
write_jsonl('raw_data.jsonl', filtered_lines)

The raw_data.jsonl file contains lines with no title, and it is hard for us to infer the intent from the query text alone.

jq -c '. | select(.title == "NA")' raw_data.jsonl | wc -l
118457

We are going to consider only the click-signal data that contains titles.

jq -c '. | select(.title != "NA")' raw_data.jsonl | wc -l
770218
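The same counts can be reproduced with the read_jsonl helper defined above; a minimal sketch (the jq one-liners are quicker for spot checks):

In [ ]:
from collections import Counter
# Tally lines with ('NA') and without a real title, mirroring the jq counts above
Counter(line['title'] == 'NA' for line in read_jsonl('raw_data.jsonl'))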

Get the predictions

Logic to choose a label (sketched in code after this list)

  • Take predict(q) -> (intentA, scoreA) and predict(click_doc_title) -> (intentB, scoreB), then:
  • if intentA == intentB and max(scoreA, scoreB) > 85%, assign (q, intent) as ground truth
  • else if intentA == intentB and max(scoreA, scoreB) < 85%, assign (q, intent) as potential ground truth, which needs to be passed to Prodigy for active learning
  • else (intentA != intentB), pass it for manual labeling
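As a sketch, the heuristic above can be written as a single function (choose_label and THRESHOLD are illustrative names; the notebook applies the same logic with pandas below):

In [ ]:
THRESHOLD = 0.85  # the 85% cutoff from the list above

def choose_label(query: str, clicked_doc_title: str):
    # Predict an intent for the query and for the clicked doc's title
    intent_a, score_a = predict(query)
    intent_b, score_b = predict(clicked_doc_title)
    if intent_a == intent_b:
        if max(score_a, score_b) > THRESHOLD:
            return ('ground_truth', intent_a)
        # Potential ground truth: route to Prodigy for active learning
        return ('prodigy_candidate', intent_a)
    # Conflicting intents: route to manual labeling
    return ('manual_labeling', None)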
In [49]:
# Filter out click signals that do not have a title
clickstream_data = [entry for entry in read_jsonl('raw_data.jsonl') if entry['title'] != 'NA']
In [50]:
len(clickstream_data)
Out[50]:
770218
In [56]:
predict('rhel')
Out[56]:
('USAGE', 0.93)
In [52]:
print('Query prediction :{}'.format(predict(clickstream_data[0].get('query_s'))))
Query prediction :('USAGE', 0.63)
In [53]:
print('Title prediction :{}'.format(predict(clickstream_data[0].get('title'))))
Title prediction :('USAGE', 0.62)
In [54]:
for each in clickstream_data:
    query_intent, query_prediction_score = predict(each.get('query_s'))
    title_intent, title_prediction_score = predict(each.get('title'))
    each['query_intent'], each['query_prediction_score'] = query_intent, query_prediction_score
    each['title_intent'], each['title_prediction_score'] = title_intent, title_prediction_score
In [66]:
clickstream_data[0]
Out[66]:
{'query_s': 'systemd',
 'doc_id_s': 'https://access.redhat.com/articles/754933',
 'aggr_count_i': 40,
 'title': 'Overview of systemd for RHEL 7',
 'query_intent': 'TROUBLESHOOT',
 'query_prediction_score': 0.62,
 'title_intent': 'USAGE',
 'title_prediction_score': 0.56}
In [55]:
write_jsonl('raw_data_with_predictions.jsonl', clickstream_data)

Apply the heuristics

Logic to choose a label (restated from above)

  • Take predict(q) -> (intentA, scoreA) and predict(click_doc_title) -> (intentB, scoreB), then:
  • if intentA == intentB and max(scoreA, scoreB) > 85%, assign (q, intent) as ground truth
  • else if intentA == intentB and max(scoreA, scoreB) < 85%, assign (q, intent) as potential ground truth, which needs to be passed to Prodigy for active learning
  • else (intentA != intentB), pass it for manual labeling
In [71]:
import pandas as pd
In [56]:
df = pd.read_json('raw_data_with_predictions.jsonl', lines=True)
In [61]:
len(df)
Out[61]:
770218
In [57]:
df.head()
Out[57]:
In [58]:
# Keep the rows where the query intent and the title intent agree
df_with_same_intent = df[df['query_intent'] == df['title_intent']]
In [60]:
len(df_with_same_intent)
Out[60]:
553587
In [62]:
len(df_with_same_intent[df_with_same_intent['title_intent'] == 'USAGE'])
Out[62]:
452758
In [ ]:
len(df_with_same_intent[df_with_same_intent['title_intent'] == 'TROUBLESHOOT'])

Ground Truth - Predictions with the same intent and a score above the 85% threshold

We are going to take the prediction scores of the entries with the same intent; if the max score is greater than the 85% threshold, we treat them as ground truth.

In [69]:
df_with_same_intent = df_with_same_intent.copy()  # explicit copy to avoid SettingWithCopyWarning
df_with_same_intent['max_prediction_score'] = df_with_same_intent[['title_prediction_score', 'query_prediction_score']].max(axis='columns')
In [73]:
df_with_same_intent.head()
Out[73]:
In [74]:
ground_truth_df = df_with_same_intent[df_with_same_intent['max_prediction_score'] > 0.85]
In [77]:
len(ground_truth_df)
Out[77]:
283953
In [80]:
final_df = ground_truth_df[['query_s', 'title_intent']]
In [84]:
final = final_df.rename(columns={'query_s': 'text', 'title_intent': 'label'})
In [85]:
final.to_json('ground_truth.jsonl', orient='records', lines=True)

Predictions with the same intent but a score below the 85% threshold

In [88]:
prodigy_candidates_df = df_with_same_intent[df_with_same_intent['max_prediction_score'] < 0.85]
In [105]:
prodigy_candidates = prodigy_candidates_df[['aggr_count_i', 'query_s', 'title', 'title_intent', 'max_prediction_score']]
In [106]:
prodigy_candidates.head(100)
Out[106]:
In [109]:
candidates_20 = prodigy_candidates[prodigy_candidates['aggr_count_i'] > 20];len(candidates_20)
Out[109]:
26658
In [93]:
prodigy_candidates = prodigy_candidates.rename(columns={'query_s': 'text', 'title_intent': 'label'})
In [96]:
prodigy_candidates.head(100);len(prodigy_candidates)
Out[96]:
255908
In [ ]:
prodigy_candidates

Predictions with different intents for the query and the clicked doc title

In [98]:
df_with_different_intent = df[df['query_intent'] != df['title_intent']]
In [102]:
df_with_different_intent
Out[102]:
In [107]:
len(df_with_different_intent)
Out[107]:
216631
In [82]:
out_df.head(100)
Out[82]:
In [110]:
import jovian
In [ ]:
jovian.commit()
[jovian] Saving notebook..

Next Steps

  • Do Prodigy active learning for case 2 (try to get more TROUBLESHOOT samples) - Target: 1500
  • Do manual labeling for case 3 (try to get more TROUBLESHOOT samples) - Target: 1500