Jovian
⭐️
Sign In

This notebook will provide the step by step process for search quality evaluation by computing the avg click rank.

In [1]:
import ujson
from pathlib import Path
import numpy as np
import requests,json

Load the query corpus

In [2]:
def read_jsonl(file_path):
    with Path(file_path).open('r', encoding='utf8') as infile:
        for line in infile:
            try:
                yield ujson.loads(line.strip())
            except ValueError:
                print("Error in reading a jsonline")
                continue
In [36]:
def write_jsonl(file_path, lines):
    data = [ujson.dumps(line, escape_forward_slashes=False) for line in lines]
    Path(file_path).open('w', encoding='utf8').write('\n'.join(data))

The below jsonl file is extracted using solrdump tool followed by jq processing.

  • solrdump -server "{{solrUrl}}/solr/access_signals" -q "timestamp_tdt:[NOW-7DAY/DAY TO NOW]" -fl "query_orig_s,doc_id_s,rank_i,timestamp_tdt"
  • jq -c '. | {q:.q,doc:.doc,rank:.rank}' queries.jsonl | sort > queries.jsonl.out
In [3]:
query_path = '../data/queries.jsonl.out'
In [4]:
query_data = [{'qid': index, 'q': line['q'], 'doc': line['doc'], 'rank': line['rank']} for index, line in enumerate(read_jsonl(query_path))]
In [84]:
len(query_data)
Out[84]:
12611
In [85]:
query_data[-6:-1]
Out[85]:
[{'qid': 12605,
  'q': 'z8 g4',
  'doc': 'https://access.redhat.com/content/4045071',
  'rank': 3},
 {'qid': 12606,
  'q': 'Zabbix',
  'doc': 'https://access.redhat.com/solutions/2179341',
  'rank': 8},
 {'qid': 12607,
  'q': 'zenity',
  'doc': 'https://access.redhat.com/errata/RHBA-2017:2147',
  'rank': 1},
 {'qid': 12608,
  'q': 'zfs',
  'doc': 'https://access.redhat.com/solutions/79633',
  'rank': 1},
 {'qid': 12609,
  'q': 'zones',
  'doc': 'https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/security_guide/sec-working_with_zones',
  'rank': 2}]
In [5]:
query_list = [each.get('q') for each in query_data if each['q']]
In [6]:
query_doc = [(each.get('qid'), each.get('q'), each.get('doc')) for each in query_data if each['q']]

Average Click Rank & MRR

This metric measures the average position of clicked items in the search results for the specified dimensions and filters. Its value is greater or equal to 1. Lower values are better. A value of 1 would mean that users always open the first item in the search results list.

In [49]:
b ={'a':1};b.get('c', 0)
Out[49]:
0
In [59]:
ranks = [each.get('rank', 0) for each in query_data if each['rank']]
In [60]:
rank_array = np.array(ranks)
In [61]:
rank_array
Out[61]:
array([ 1,  4,  2, ...,  1,  2, 22])
In [62]:
len(rank_array)
Out[62]:
12533
In [63]:
rank_array.sum()
Out[63]:
54348
In [64]:
rank_array.sum()/len(query_data)
Out[64]:
4.3095710094362065
In [68]:
def mean_reciprocal_rank(ranks):
    """Score is reciprocal of the rank of the first relevant item

    First element is 'rank 1'. 

    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    Reference: https://github.com/benhamner/Metrics
    >>> ranks = [2, 3, 5, 0, -1]
    >>> 
    >>> mean_reciprocal_rank(ranks)
    0.35    
    Returns:
        Mean reciprocal rank
    """
    count = len(ranks)
    if count == 0:
        return 0
    # skip -1
    valid_ranks = [x for x in ranks if x >= 0]
    sum_reciprocal_ranks = sum([1./(rank + 1) for rank in valid_ranks])
    return sum_reciprocal_ranks/float(count)

In [66]:
mean_reciprocal_rank(rank_array)
Out[66]:
0.31905391328244903

Baseline Avg Click rank: 4.31 and MRR : 0.32

Execute Searches

In [7]:
import urllib3
In [8]:
#Suppress InsecureRequestWarning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
In [9]:
DEFAULT_SEARCH_SERVICE_URI = 'https://api.access.redhat.com/rs/search'
ROWS = 50
ENABLE_SIGNALS = 'true' 
ENABLE_ELEVATION = 'true' #Disable elevation as we want to evaluate against the natural search results
headers = {'Accept': 'application/vnd.redhat.solr+json'}
In [10]:
def update_params(query: str):
    params = {
        'q': query, 'wt': 'json', 'start': 0, 'rows': ROWS,
        'fq': '''-id:Other AND language:(en) AND -ModerationState:(draft) AND -documentKind:(PortalProduct+OR+ContainerVendor+OR+Packages)''',
        'enableSignals': ENABLE_SIGNALS, 'enableElevation': ENABLE_ELEVATION,
        'fl': 'view_uri, score'
    }    
    return params
In [11]:
def existsInIndex(target_url: str):
    params = {
        'q': 'view_uri:"{}"'.format(target_url), 
        'wt': 'json', 'start': 0, 'rows': 1,       
        'fl': 'view_uri'
    }    
    response = requests.get(url=DEFAULT_SEARCH_SERVICE_URI, params=params, 
             headers=headers, verify=False)
    if (response.status_code != 200):
        print(response.text)        
        return False
    data = json.loads(response.text)
    docs = data.get('response', {}).get('docs', [])
    if len(docs) == 1:
        return docs[0]['view_uri'] == target_url
    return False
In [57]:
class QueryResult:
    
    def __init__(self, qid, rank, score, numFound):
        self.qid = qid
        self.rank = rank
        self.score = score
        self.numFound = numFound
        
    def __repr__(self):
        #Return dictionary representation of the object properties
        return str(self.__dict__)
    
    def __str__(self):
        return "{{qid:{}, rank:{}, score:{}, numFound:{}}}".format(self.qid, self.rank, self.score, self.numFound)
In [58]:
res = QueryResult(1, 2, 3, 4); print(res);res
{qid:1, rank:2, score:3, numFound:4}
Out[58]:
{'qid': 1, 'rank': 2, 'score': 3, 'numFound': 4}
In [59]:
write_jsonl('results.jsonl', [res])
In [60]:
res
Out[60]:
{'qid': 1, 'rank': 2, 'score': 3, 'numFound': 4}
In [61]:
def execute_search(qid: int, query: str, target_url: str):
    
    #Target url does not exist in index so returning -2 -> NOT_IN_INDEX
    if not existsInIndex(target_url):
        return QueryResult(qid, -2, -1, -1)
    
    params = update_params(query)
    response = requests.get(url=DEFAULT_SEARCH_SERVICE_URI, params=params, 
             headers=headers, verify=False)
    if (response.status_code != 200):
        print(response.text)        
        return dict()

    data = json.loads(response.text)
    numFound = data.get('response', -1).get('numFound', -1)
    docs = data.get('response', {}).get('docs', [])

    #Get the results from the search response docs containing view_uri, score
    results = {x['view_uri']:(index+1,x['score']) for index, x in enumerate(docs) if len(docs) > 0}

    #check if target_url exists in the index else return -1 -> NOT_IN_RECALL
    rank,score = results.get(target_url, (-1, -1))
    return QueryResult(qid, rank, score, numFound)  

Valid target_url in Index

In [16]:
result = execute_search(1, 'register rhel', 'https://access.redhat.com/labs/registrationassistant')
In [17]:
result.rank
Out[17]:
1

Invalid target_url

In [14]:
execute_search(1, 'register rhel', 'https://access.redhat.com/labinfo/registrationassistant')
Out[14]:
(-1, -1)

Parallel Processing

In [62]:
from multiprocessing import Pool
In [63]:
num_clients=5
In [64]:
pool = Pool(num_clients)

Time taken for 10 queries

In [97]:
%%time
results = pool.starmap(execute_search, query_doc[1050:1060])
CPU times: user 4.86 ms, sys: 2.48 ms, total: 7.34 ms Wall time: 2.29 s

Time taken for 100 queries

In [65]:
%%time
results = pool.starmap(execute_search, query_doc[1050:1150])
CPU times: user 79.5 ms, sys: 41.4 ms, total: 121 ms Wall time: 32.8 s
In [66]:
results
Out[66]:
[{'qid': 1050, 'rank': 4, 'score': 419.81564, 'numFound': 5443},
 {'qid': 1051, 'rank': 1, 'score': 8662.851, 'numFound': 5443},
 {'qid': 1052, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1053, 'rank': 5, 'score': 335.35538, 'numFound': 51},
 {'qid': 1054, 'rank': 1, 'score': 81653.234, 'numFound': 377},
 {'qid': 1055, 'rank': 1, 'score': 80814.9, 'numFound': 1562},
 {'qid': 1056, 'rank': 9, 'score': 355.91293, 'numFound': 1109},
 {'qid': 1057, 'rank': 2, 'score': 431.52686, 'numFound': 82},
 {'qid': 1058, 'rank': 1, 'score': 25637.838, 'numFound': 82},
 {'qid': 1059, 'rank': 6, 'score': 109.81958, 'numFound': 256},
 {'qid': 1060, 'rank': 14, 'score': 137.18625, 'numFound': 1738},
 {'qid': 1061, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1062, 'rank': 1, 'score': 5562.9634, 'numFound': 919},
 {'qid': 1063, 'rank': 2, 'score': 1504.3196, 'numFound': 919},
 {'qid': 1064, 'rank': 2, 'score': 63604.344, 'numFound': 558},
 {'qid': 1065, 'rank': 2, 'score': 9873.246, 'numFound': 186},
 {'qid': 1066, 'rank': 1, 'score': 0, 'numFound': 3},
 {'qid': 1067, 'rank': 1, 'score': 8647.309, 'numFound': 77},
 {'qid': 1068, 'rank': 7, 'score': 34477.773, 'numFound': 7},
 {'qid': 1069, 'rank': 2, 'score': 118.95092, 'numFound': 15},
 {'qid': 1070, 'rank': 1, 'score': 1146.1467, 'numFound': 38},
 {'qid': 1071, 'rank': 4, 'score': 843.7009, 'numFound': 14},
 {'qid': 1072, 'rank': 2, 'score': 639.3539, 'numFound': 26},
 {'qid': 1073, 'rank': 3, 'score': 541.03046, 'numFound': 26},
 {'qid': 1074, 'rank': 1, 'score': 6817.311, 'numFound': 26},
 {'qid': 1075, 'rank': 1, 'score': 0, 'numFound': 5},
 {'qid': 1076, 'rank': 1, 'score': 0, 'numFound': 1},
 {'qid': 1077, 'rank': 3, 'score': 0, 'numFound': 3},
 {'qid': 1078, 'rank': 1, 'score': 25775.7, 'numFound': 14},
 {'qid': 1079, 'rank': 6, 'score': 242.12442, 'numFound': 14},
 {'qid': 1080, 'rank': 8, 'score': 215.84134, 'numFound': 14},
 {'qid': 1081, 'rank': 7, 'score': 234.6094, 'numFound': 14},
 {'qid': 1082, 'rank': 5, 'score': 257.83514, 'numFound': 14},
 {'qid': 1083, 'rank': 3, 'score': 7122.238, 'numFound': 14},
 {'qid': 1084, 'rank': 4, 'score': 283.90503, 'numFound': 14},
 {'qid': 1085, 'rank': 3, 'score': 5700.134, 'numFound': 3},
 {'qid': 1086, 'rank': 2, 'score': 884.7329, 'numFound': 33},
 {'qid': 1087, 'rank': 1, 'score': 32100.479, 'numFound': 441},
 {'qid': 1088, 'rank': 10, 'score': 134.11969, 'numFound': 456},
 {'qid': 1089, 'rank': 1, 'score': 186.8241, 'numFound': 456},
 {'qid': 1090, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1091, 'rank': 1, 'score': 0, 'numFound': 73},
 {'qid': 1092, 'rank': 2, 'score': 163214.86, 'numFound': 770},
 {'qid': 1093, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1094, 'rank': 1, 'score': 0, 'numFound': 1},
 {'qid': 1095, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1096, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1097, 'rank': 5, 'score': 304.72314, 'numFound': 686},
 {'qid': 1098, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1099, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1100, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1101, 'rank': 2, 'score': 13434.717, 'numFound': 25},
 {'qid': 1102, 'rank': 3, 'score': 42747.555, 'numFound': 1072},
 {'qid': 1103, 'rank': 1, 'score': 630983.3, 'numFound': 1072},
 {'qid': 1104, 'rank': 5, 'score': 187.9692, 'numFound': 1072},
 {'qid': 1105, 'rank': 1, 'score': 0, 'numFound': 36},
 {'qid': 1106, 'rank': 1, 'score': 0, 'numFound': 213},
 {'qid': 1107, 'rank': 1, 'score': 0, 'numFound': 591},
 {'qid': 1108, 'rank': 2, 'score': 3243.2876, 'numFound': 67},
 {'qid': 1109, 'rank': 2, 'score': 5882.3823, 'numFound': 28},
 {'qid': 1110, 'rank': 6, 'score': 2337.083, 'numFound': 28},
 {'qid': 1111, 'rank': 1, 'score': 544787.8, 'numFound': 3662},
 {'qid': 1112, 'rank': 1, 'score': 544787.8, 'numFound': 3662},
 {'qid': 1113, 'rank': 1, 'score': 1376.1748, 'numFound': 1376},
 {'qid': 1114, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1115, 'rank': 1, 'score': 0, 'numFound': 380},
 {'qid': 1116, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1117, 'rank': 2, 'score': 6239.474, 'numFound': 13},
 {'qid': 1118, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1119, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1120, 'rank': 2, 'score': 117581.87, 'numFound': 457},
 {'qid': 1121, 'rank': 3, 'score': 1328.2976, 'numFound': 45},
 {'qid': 1122, 'rank': 2, 'score': 1424.4967, 'numFound': 45},
 {'qid': 1123, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1124, 'rank': 3, 'score': 10207.402, 'numFound': 471},
 {'qid': 1125, 'rank': 2, 'score': 101.749115, 'numFound': 266},
 {'qid': 1126, 'rank': 2, 'score': 0, 'numFound': 183},
 {'qid': 1127, 'rank': 1, 'score': 0, 'numFound': 108},
 {'qid': 1128, 'rank': 4, 'score': 367.2867, 'numFound': 561},
 {'qid': 1129, 'rank': 6, 'score': 183.68881, 'numFound': 561},
 {'qid': 1130, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1131, 'rank': 6, 'score': 417.7389, 'numFound': 1313},
 {'qid': 1132, 'rank': 4, 'score': 771.4134, 'numFound': 1313},
 {'qid': 1133, 'rank': 3, 'score': 4947.2354, 'numFound': 1313},
 {'qid': 1134, 'rank': 3, 'score': 4095.2056, 'numFound': 63},
 {'qid': 1135, 'rank': 2, 'score': 25929.186, 'numFound': 63},
 {'qid': 1136, 'rank': 5, 'score': 35033.36, 'numFound': 724},
 {'qid': 1137, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1138, 'rank': 4, 'score': 638.0653, 'numFound': 225},
 {'qid': 1139, 'rank': 3, 'score': 115.97895, 'numFound': 4},
 {'qid': 1140, 'rank': -2, 'score': -1, 'numFound': -1},
 {'qid': 1141, 'rank': 1, 'score': 0, 'numFound': 91},
 {'qid': 1142, 'rank': 2, 'score': 12420.592, 'numFound': 91},
 {'qid': 1143, 'rank': 1, 'score': 90362.26, 'numFound': 2679},
 {'qid': 1144, 'rank': 2, 'score': 223965.53, 'numFound': 1487},
 {'qid': 1145, 'rank': 2, 'score': 778718.9, 'numFound': 1487},
 {'qid': 1146, 'rank': 4, 'score': 1129.5156, 'numFound': 36},
 {'qid': 1147, 'rank': 2, 'score': 30795.426, 'numFound': 58},
 {'qid': 1148, 'rank': 2, 'score': 0, 'numFound': 184},
 {'qid': 1149, 'rank': 1, 'score': 112.24665, 'numFound': 405}]

Time taken for 1000 queries

In [183]:
#### DO NOT EXECUTE THIS ####
#%%time
#results = pool.map(execute_search, query_list[1050:2050])
CPU times: user 1.91 s, sys: 1.27 s, total: 3.18 s Wall time: 4min 54s

Report

In [67]:
from collections import Counter

Query Corpus also returns rank as input. We can calculate the position wise breakup of them as before the change.

In [44]:
before_change_ranks = [each.get('rank', 0) for each in query_data[1050:1150] if each['rank']]
In [46]:
counter = Counter(before_change_ranks)
In [74]:
result_ranks = [each.rank for each in results]
In [75]:
counter = Counter(result_ranks)
In [101]:
position_wise_summary = {str(key): value/len(result_ranks) for key, value in counter.items() if key <= 5 and key > 0};position_wise_summary
Out[101]:
{'4': 0.07, '1': 0.28, '5': 0.05, '2': 0.22, '3': 0.1}
In [102]:
total_top_5 = sum(position_wise_summary.values());round(total_top_5, 2)
Out[102]:
0.72
In [103]:
total_out_of_range = 1 - total_top_5; round(total_out_of_range, 2)
Out[103]:
0.28
In [104]:
def generate_report(position_wise_summary, total_top_5):
    with open('report.txt', 'w') as out:
        for key in sorted(position_wise_summary.keys()):
            out.write("#{}: {}%\n".format(key,round(position_wise_summary[key]*100), 2))
            
        out.write("Total Top 5 (in %):{}\n".format(round(total_top_5*100, 2)))
        out.write("Total Out of range (in %):{}\n".format(round((1-total_top_5)*100),2))
In [105]:
generate_report(position_wise_summary, total_top_5)

The sample report is as follows in report.csv

#1,#2,#3,#4,#5,top5,out_of_range
0.28,0.22,0.1,0.07,0.05,0.72,0.28
In [112]:
desired_format = {"#1": position_wise_summary['1'], "#2": position_wise_summary['2'], "#3": position_wise_summary['3'],
                 "#4": position_wise_summary['4'], "#5": position_wise_summary['5'], 'top5': total_top_5,
                  'out_of_range': 1-total_top_5}; desired_format
Out[112]:
{'#1': 0.28,
 '#2': 0.22,
 '#3': 0.1,
 '#4': 0.07,
 '#5': 0.05,
 'top5': 0.72,
 'out_of_range': 0.28}
In [117]:
import csv

with open('report.csv', mode='w') as csv_file:
    fieldnames = ['#1', '#2', '#3', '#4', '#5', 'top5', 'out_of_range']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerow(desired_format)
In [115]:
import jovian
In [78]:
#!pip install jovian
In [ ]:
jovian.commit()
[jovian] Saving notebook..
In [ ]: