Jovian
⭐️
Sign In
In [1]:
import pandas as pd
In [2]:
# Load the raw test CSV. Forward slashes keep this relative path valid on
# Windows, Linux and macOS; the backslash form only resolves on Windows.
test_data = pd.read_csv('../data/drugsComTest_raw.csv')
In [3]:
# Print the test frame's column names, non-null counts and dtypes.
test_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 53766 entries, 0 to 53765 Data columns (total 7 columns): uniqueID 53766 non-null int64 drugName 53766 non-null object condition 53471 non-null object review 53766 non-null object rating 53766 non-null int64 date 53766 non-null object usefulCount 53766 non-null int64 dtypes: int64(3), object(4) memory usage: 2.9+ MB
In [10]:
# Boolean row masks over the test frame, one per drug of interest.
cond1 = test_data['drugName'].eq('Bupropion')
cond2 = test_data['drugName'].eq('Sertraline')
In [31]:
# Keep the test rows matched by either drug mask.
f_testdf = test_data.loc[cond1 | cond2]
In [35]:
# Narrow the selected test rows to those whose condition is 'Depression'.
f_testdf = f_testdf.loc[f_testdf['condition'] == 'Depression']
In [15]:
# Load the raw training CSV. Forward slashes keep this relative path valid on
# Windows, Linux and macOS; the backslash form only resolves on Windows.
train_data = pd.read_csv('../data/drugsComTrain_raw.csv')
In [16]:
# Print the training frame's column names, non-null counts and dtypes.
train_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 161297 entries, 0 to 161296 Data columns (total 7 columns): uniqueID 161297 non-null int64 drugName 161297 non-null object condition 160398 non-null object review 161297 non-null object rating 161297 non-null int64 date 161297 non-null object usefulCount 161297 non-null int64 dtypes: int64(3), object(4) memory usage: 8.6+ MB
In [6]:
# Count reviews per drug among training rows whose condition is 'Depression'.
train_data.loc[train_data['condition'] == 'Depression', 'drugName'].value_counts()
Out[6]:
Bupropion         549
Sertraline        459
Venlafaxine       437
Pristiq           418
Desvenlafaxine    414
                 ... 
Maprotiline         1
Vyvanse             1
Vivactil            1
Asendin             1
Luvox CR            1
Name: drugName, Length: 105, dtype: int64
In [32]:
# Boolean row masks over the training frame, one per drug of interest.
cond3 = train_data['drugName'].eq('Bupropion')
cond4 = train_data['drugName'].eq('Sertraline')
# Keep the training rows matched by either mask, then preview the first rows.
f_traindf = train_data.loc[cond3 | cond4]
f_traindf.head(n=5)
Out[32]:
In [40]:
# Narrow the selected training rows to those whose condition is 'Depression'.
f_traindf = f_traindf.loc[f_traindf['condition'] == 'Depression']
In [41]:
# Stack the filtered training rows on top of the filtered test rows.
# Each frame keeps its own row labels (pandas' default, spelled out here).
df = pd.concat([f_traindf, f_testdf], axis=0, ignore_index=False)
In [42]:
# Preview the first five rows of the combined frame.
df.head(n=5)
Out[42]:
In [43]:
# Number of rows left in the filtered training frame.
f_traindf.shape[0]
Out[43]:
1008
In [44]:
# Tally the conditions recorded for Sertraline rows in the filtered training frame.
f_traindf.loc[f_traindf['drugName'] == 'Sertraline', 'condition'].value_counts()
Out[44]:
Depression    459
Name: condition, dtype: int64
In [26]:
# Tally every condition recorded for Bupropion in the full training data.
# This reads train_data rather than f_traindf: at this point in the notebook
# f_traindf has already been narrowed to 'Depression', so it could only
# produce a single-row tally when the notebook is run top to bottom.
train_data.loc[train_data['drugName'] == 'Bupropion', 'condition'].value_counts()
Out[26]:
Depression                                   549
Smoking Cessation                            147
Major Depressive Disorde                     114
ADHD                                          58
Anxiety                                       55
Bipolar Disorde                               32
Sexual Dysfunction, SSRI Induced              27
Seasonal Affective Disorde                    11
Panic Disorde                                  7
Obesity                                        5
Not Listed / Othe                              4
Persistent Depressive Disorde                  3
Premenstrual Dysphoric Disorde                 2
Postural Orthostatic Tachycardia Syndrome      2
Migraine Prevention                            1
Name: condition, dtype: int64
In [45]:
# Tally the condition values present in the combined frame.
df['condition'].value_counts()
Out[45]:
Depression    1410
Name: condition, dtype: int64
In [46]:
# Print the combined frame's index range, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1410 entries, 31 to 53580 Data columns (total 7 columns): uniqueID 1410 non-null int64 drugName 1410 non-null object condition 1410 non-null object review 1410 non-null object rating 1410 non-null int64 date 1410 non-null object usefulCount 1410 non-null int64 dtypes: int64(3), object(4) memory usage: 88.1+ KB
In [52]:
# Write the combined frame to CSV without the row index. Forward slashes keep
# this relative path valid on Windows, Linux and macOS; with backslashes a
# POSIX system would treat the whole string as a single file name.
df.to_csv('../data/drug_review.csv', index=False)
In [54]:
# Read the exported CSV back and preview it. Forward slashes keep this
# relative path valid on Windows, Linux and macOS.
new_df = pd.read_csv('../data/drug_review.csv')
new_df.head()
Out[54]:
In [ ]: