
Kaggle Competition: Predict Disaster Tweets

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
In [3]:
train.head(2)
Out[3]:
In [4]:
test.head(2)
Out[4]:
In [5]:
a = "pankaj%20soni"
a.replace("%20","_")
Out[5]:
'pankaj_soni'
In [6]:
train.keyword = train.keyword.str.replace("%20","_")
test.keyword = test.keyword.str.replace("%20","_")
In [7]:
train.keyword.fillna(train.keyword.mode()[0],inplace=True)
test.keyword.fillna(test.keyword.mode()[0],inplace=True)
In [8]:
train.shape
Out[8]:
(7613, 5)
In [9]:
test.shape
Out[9]:
(3263, 4)
In [10]:
train.id.max()
Out[10]:
10873
In [11]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7613 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
In [12]:
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
id          3263 non-null int64
keyword     3263 non-null object
location    2158 non-null object
text        3263 non-null object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
In [13]:
# plt.figure(figsize=(16,4))
# sns.countplot(train.keyword)
# plt.xticks(rotation=90)
# plt.show()
In [14]:
# train.keyword.unique().size
In [15]:
# train.keyword.mode()
In [16]:
# train.keyword.value_counts()
In [17]:
# train.location.unique().size
In [18]:
# train.isnull().any()
In [19]:
# train.isnull().sum()
In [20]:
train.text.head(5)
Out[20]:
0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object
Remove URLs, punctuation, digits, and extra whitespace from the text
In [21]:
import re
In [22]:
train.text[200]
Out[22]:
'HAPPENING NOW - HATZOLAH EMS AMBULANCE RESPONDING WITH DUAL SIRENS AND\x89Û_ https://t.co/SeK6MQ6NJF'
In [23]:
# for i in range(100,200):
#     print(train.text[i])
In [24]:
def textCleaning(text):
    text = re.sub(r"http(s)?://\w+\.\w+/\w+", "", text)  # drop simple URLs (e.g. https://t.co/...)
    text = re.sub(r"[^\w\s]+", "", text)                 # drop punctuation and other non-word characters
    text = re.sub(r"[\d]+", "", text)                    # drop digits
    text = re.sub(r"\s+", " ", text)                     # collapse repeated whitespace
    text = re.sub(r"^\s", "", text)                      # trim leading whitespace
    text = re.sub(r"\s$", "", text)                      # trim trailing whitespace
    return text
textCleaning(train.text[92])
Out[24]:
'PM TRAFFIC ACCIDENT NO INJURY at WILLIS FOREMAN RD'
In [25]:
train.text = train.text.apply(textCleaning)
test.text = test.text.apply(textCleaning)
In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay,accuracy_score,roc_auc_score,f1_score
from sklearn.model_selection import train_test_split
In [33]:
n_gram = (1,4)
In [98]:
# cv = CountVectorizer(stop_words="english",ngram_range=n_gram,max_features=5000)
cv = CountVectorizer(ngram_range=n_gram)
In [99]:
X_train,X_test,y_train,y_test = train_test_split(train.text,train.target,test_size=.2,random_state=5)
X_train_dtm = cv.fit_transform(X_train).todense()
X_test_dtm = cv.transform(X_test).todense()
In [100]:
X_train_dtm.shape
Out[100]:
(6090, 174291)
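With ngram_range=(1,4) the vocabulary grows to about 174k columns, so calling .todense() allocates a very large array. A minimal sketch of the same split that keeps the matrices sparse (the names X_train_sp / X_test_sp are introduced here for illustration); scikit-learn's linear models and Naive Bayes accept sparse input directly.
In [ ]:
# Sketch (assumption): same split as above, but keep the document-term matrices sparse.
# CSR matrices store only the non-zero counts instead of a 6090 x 174291 dense array.
X_train_sp = cv.fit_transform(X_train)   # scipy.sparse CSR matrix
X_test_sp = cv.transform(X_test)
print(X_train_sp.shape, X_train_sp.nnz)  # shape and number of stored non-zero entries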
Stacking multiple vectorized features (text + keyword)
In [56]:
# X_train,X_test,y_train,y_test = train_test_split(train.text,train.target,test_size=.2,random_state=5)
train_text_dtm = cv.fit_transform(train.text).todense()
train_kw_dtm = cv.fit_transform(train.keyword).todense()  # note: this refit replaces the text vocabulary held by cv (see the sketch below)
train_kw_text_dtm = np.hstack([train_text_dtm,train_kw_dtm])
In [57]:
train_kw_text_dtm.shape
Out[57]:
(7613, 5220)
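Refitting the same cv on train.keyword replaces the vocabulary it learned from train.text, so the fitted vectorizer can no longer transform new text consistently. A hedged sketch using two separate vectorizers (the names text_cv and kw_cv are hypothetical) and a sparse hstack:
In [ ]:
# Sketch (assumption): one vectorizer per column so each keeps its own vocabulary,
# and scipy.sparse.hstack to stack the features without densifying.
from scipy.sparse import hstack
text_cv = CountVectorizer(ngram_range=n_gram)
kw_cv = CountVectorizer()
train_text_sp = text_cv.fit_transform(train.text)
train_kw_sp = kw_cv.fit_transform(train.keyword)
train_kw_text_sp = hstack([train_text_sp, train_kw_sp]).tocsr()
# the test set must go through transform() with the already-fitted vectorizers
test_kw_text_sp = hstack([text_cv.transform(test.text), kw_cv.transform(test.keyword)]).tocsr()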
In [58]:
X_train,X_test,y_train,y_test = train_test_split(train_kw_text_dtm,train.target,test_size=.2,random_state=5)
In [83]:
X_train_dtm = X_train_dtm.astype("int8")
y_train = y_train.astype("int8")

Using Gaussian Naive Bayes

In [207]:
# from sklearn.naive_bayes import GaussianNB,MultinomialNB
In [212]:
# nb = GaussianNB()
# nb.fit(X_train_dtm,y_train)
# y_pred = nb.predict(X_test_dtm)
# print("accuracy_score\t",round(accuracy_score(y_test,y_pred)*100))
# print("f1_score\t",round(f1_score(y_test,y_pred)*100))

Using Multinomial Naive Bayes

In [213]:
# mnb = MultinomialNB()
# mnb.fit(X_train_dtm,y_train)
# y_pred = mnb.predict(X_test_dtm)
# print("accuracy_score\t",round(accuracy_score(y_test,y_pred)*100))
# print("f1_score\t",round(f1_score(y_test,y_pred)*100))

Using Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression
In [61]:
lr = LogisticRegression(class_weight="balanced")
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print("accuracy_score\t",round(accuracy_score(y_test,y_pred)*100))
print("f1_score\t",round(f1_score(y_test,y_pred)*100))
accuracy_score	 78.0
f1_score	 74.0
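ConfusionMatrixDisplay and roc_auc_score are imported above but never used; a hedged sketch of how the logistic-regression errors could be inspected with them, using the fitted lr and the held-out split from the cell above:
In [ ]:
# Sketch (assumption): visualize the confusion matrix for the fitted LogisticRegression.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm, display_labels=[0, 1]).plot()
plt.show()
# roc_auc_score expects scores/probabilities for the positive class:
print("roc_auc_score\t", round(roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]) * 100))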

Using LightGBM

In [62]:
from lightgbm import LGBMClassifier
In [63]:
lgbm = LGBMClassifier(class_weight="balanced")
lgbm.fit(X_train_dtm,y_train)
y_pred = lgbm.predict(X_test_dtm)
print("accuracy_score\t",round(accuracy_score(y_test,y_pred)*100))
print("f1_score\t",round(f1_score(y_test,y_pred)*100))
accuracy_score	 78.0
f1_score	 74.0
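LightGBM also accepts scipy sparse matrices, so the .todense() and int8 casts are optional. A hedged sketch, assuming the sparse X_train_sp / X_test_sp from the sketch after In [100], that passes the held-out split as an eval_set so validation metrics are tracked during training:
In [ ]:
# Sketch (assumption): LGBMClassifier on sparse CSR input with a validation eval_set.
# Early stopping could be added on top; the exact argument depends on the LightGBM version.
lgbm_sp = LGBMClassifier(class_weight="balanced")
lgbm_sp.fit(X_train_sp, y_train, eval_set=[(X_test_sp, y_test)])
print("f1_score\t", round(f1_score(y_test, lgbm_sp.predict(X_test_sp)) * 100))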

Using TF-IDF

In [ ]:
# tfidf = TfidfVectorizer(stop_words="english",ngram_range=n_gram)
# # tfidf = TfidfVectorizer(ngram_range=n_gram)
In [ ]:
# X_train_dtm = tfidf.fit_transform(X_train).toarray()
# X_test_dtm = tfidf.transform(X_test).toarray()
# X_train_dtm = X_train_dtm.astype("int8")
# y_train = y_train.astype("int8")
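The TF-IDF cells above are commented out; note that TF-IDF weights are floats, so the int8 cast in the last commented line would discard most of the information. A minimal sketch of a TF-IDF + Logistic Regression pipeline on the same text and target columns (the pipeline name tfidf_lr is introduced here for illustration):
In [ ]:
# Sketch (assumption): Pipeline keeps the vectorizer and classifier together;
# TF-IDF values are floats, so no integer cast is applied here.
from sklearn.pipeline import Pipeline
tfidf_lr = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", ngram_range=(1, 2))),
    ("lr", LogisticRegression(class_weight="balanced", max_iter=1000)),
])
Xtr, Xte, ytr, yte = train_test_split(train.text, train.target, test_size=0.2, random_state=5)
tfidf_lr.fit(Xtr, ytr)
print("f1_score\t", round(f1_score(yte, tfidf_lr.predict(Xte)) * 100))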
Using Naive Bayes
In [ ]:
# nb = GaussianNB()
# nb.fit(X_train_dtm,y_train)
# y_pred = nb.predict(X_test_dtm)
# accuracy_score(y_test,y_pred)
Using Logistic Regression
In [ ]:
# lr = LogisticRegression()
# lr.fit(X_train_dtm,y_train)
# y_pred = lr.predict(X_test_dtm)
# accuracy_score(y_test,y_pred)
In [ ]:
# from lightgbm import LGBMClassifier
In [ ]:
# lgbm = LGBMClassifier()
# lgbm.fit(X_train_dtm,y_train)
# y_pred = lgbm.predict(X_test_dtm)
# accuracy_score(y_test,y_pred)
Using CatBoost
In [ ]:
# from catboost import CatBoostClassifier
In [ ]:
# cb = CatBoostClassifier(leaf_estimation_iterations=12)
# cb.fit(X_train_dtm,y_train)
# y_pred = cb.predict(X_test_dtm)
# accuracy_score(y_test,y_pred)
Using XGBoost
In [ ]:
# from xgboost import XGBClassifier
In [ ]:
# xgb = XGBClassifier(n_estimators=50)
# xgb.fit(X_train_dtm,y_train)
# y_pred = xgb.predict(X_test_dtm)
# accuracy_score(y_test,y_pred)

Using Keras

In [115]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
# del model
model = Sequential()
model.reset_states() # note: this only resets stateful recurrent-layer states; to fully clear old models/graphs use the line below
# keras.backend.clear_session()
In [116]:
model.add(Dense(units=512, activation='relu', input_shape=(174291,)))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=2, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
# Note: sparse_categorical_crossentropy is used when the targets are integer class labels;
# categorical_crossentropy is used when the targets are one-hot encoded (only 0s and 1s).
model.summary()
Model: "sequential_8" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_37 (Dense) (None, 512) 89237504 _________________________________________________________________ dense_38 (Dense) (None, 256) 131328 _________________________________________________________________ dense_39 (Dense) (None, 128) 32896 _________________________________________________________________ dense_40 (Dense) (None, 2) 258 ================================================================= Total params: 89,401,986 Trainable params: 89,401,986 Non-trainable params: 0 _________________________________________________________________
In [117]:
# y_train.shape
H = model.fit(X_train_dtm,y_train,epochs=50,validation_data=(X_test_dtm,y_test))
# y_pred = model.predict(X_test)

# y_test_class = np.argmax(y_test,axis=1)
# y_pred_class = np.argmax(y_pred,axis=1)
Train on 6090 samples, validate on 1523 samples
Epoch 1/50 - 283s 46ms/step - loss: 0.5127 - accuracy: 0.7663 - val_loss: 0.4700 - val_accuracy: 0.7879
Epoch 2/50 - 267s 44ms/step - loss: 0.1079 - accuracy: 0.9637 - val_loss: 0.6497 - val_accuracy: 0.7669
Epoch 3/50 - 264s 43ms/step - loss: 0.0506 - accuracy: 0.9757 - val_loss: 0.7036 - val_accuracy: 0.7781
Epoch 4/50 - 263s 43ms/step - loss: 0.0389 - accuracy: 0.9800 - val_loss: 0.7465 - val_accuracy: 0.7748
Epoch 5/50 - 261s 43ms/step - loss: 0.0360 - accuracy: 0.9813 - val_loss: 0.8830 - val_accuracy: 0.7774
Epoch 6/50 - 306s 50ms/step - loss: 0.0369 - accuracy: 0.9818 - val_loss: 0.8991 - val_accuracy: 0.7794
Epoch 7/50 - 257s 42ms/step - loss: 0.0342 - accuracy: 0.9821 - val_loss: 0.8041 - val_accuracy: 0.7722
Epoch 8/50 - 270s 44ms/step - loss: 0.0332 - accuracy: 0.9834 - val_loss: 0.9851 - val_accuracy: 0.7781
Epoch 9/50 - 286s 47ms/step - loss: 0.0312 - accuracy: 0.9834 - val_loss: 0.9490 - val_accuracy: 0.7814
Epoch 10/50 - 277s 45ms/step - loss: 0.0305 - accuracy: 0.9849 - val_loss: 1.0582 - val_accuracy: 0.7735
Epoch 11/50 - 267s 44ms/step - loss: 0.0324 - accuracy: 0.9849 - val_loss: 0.8903 - val_accuracy: 0.7781
Epoch 12/50 - 260s 43ms/step - loss: 0.0327 - accuracy: 0.9851 - val_loss: 1.0494 - val_accuracy: 0.7754
Epoch 13/50 - 258s 42ms/step - loss: 0.0346 - accuracy: 0.9851 - val_loss: 0.8473 - val_accuracy: 0.7781
Epoch 14/50 - 263s 43ms/step - loss: 0.0317 - accuracy: 0.9851 - val_loss: 0.8962 - val_accuracy: 0.7781
Epoch 15/50 - 256s 42ms/step - loss: 0.0300 - accuracy: 0.9859 - val_loss: 1.1027 - val_accuracy: 0.7781
Epoch 16/50 - 256s 42ms/step - loss: 0.0292 - accuracy: 0.9857 - val_loss: 1.1344 - val_accuracy: 0.7768
Epoch 17/50 - 258s 42ms/step - loss: 0.0292 - accuracy: 0.9857 - val_loss: 1.0957 - val_accuracy: 0.7768
Epoch 18/50 - 256s 42ms/step - loss: 0.0287 - accuracy: 0.9857 - val_loss: 1.1777 - val_accuracy: 0.7807
Epoch 19/50 - 271s 45ms/step - loss: 0.0280 - accuracy: 0.9856 - val_loss: 1.2930 - val_accuracy: 0.7787
Epoch 20/50 - 259s 43ms/step - loss: 0.0279 - accuracy: 0.9854 - val_loss: 1.3641 - val_accuracy: 0.7768
Epoch 21/50 - 257s 42ms/step - loss: 0.0282 - accuracy: 0.9859 - val_loss: 1.1290 - val_accuracy: 0.7800
Epoch 22/50 - 255s 42ms/step - loss: 0.0277 - accuracy: 0.9864 - val_loss: 1.2876 - val_accuracy: 0.7794
Epoch 23/50 - 259s 42ms/step - loss: 0.0283 - accuracy: 0.9862 - val_loss: 1.0862 - val_accuracy: 0.7794
Epoch 24/50 - 257s 42ms/step - loss: 0.0293 - accuracy: 0.9844 - val_loss: 1.1102 - val_accuracy: 0.7695
Epoch 25/50 - 261s 43ms/step - loss: 0.0293 - accuracy: 0.9844 - val_loss: 1.1355 - val_accuracy: 0.7768
Epoch 26/50 - 254s 42ms/step - loss: 0.0283 - accuracy: 0.9844 - val_loss: 1.4622 - val_accuracy: 0.7735
Epoch 27/50 - 253s 42ms/step - loss: 0.0326 - accuracy: 0.9844 - val_loss: 1.0943 - val_accuracy: 0.7676
Epoch 28/50 - 295s 48ms/step - loss: 0.0290 - accuracy: 0.9847 - val_loss: 1.4823 - val_accuracy: 0.7728
Epoch 29/50 - 256s 42ms/step - loss: 0.0335 - accuracy: 0.9833 - val_loss: 1.2266 - val_accuracy: 0.7702
Epoch 30/50 - 254s 42ms/step - loss: 0.0337 - accuracy: 0.9841 - val_loss: 0.9586 - val_accuracy: 0.7656
Epoch 31/50 - 257s 42ms/step - loss: 0.0379 - accuracy: 0.9821 - val_loss: 0.8539 - val_accuracy: 0.7774
Epoch 32/50 - 273s 45ms/step - loss: 0.0378 - accuracy: 0.9810 - val_loss: 0.8056 - val_accuracy: 0.7715
Epoch 33/50 - 255s 42ms/step - loss: 0.0389 - accuracy: 0.9823 - val_loss: 0.6506 - val_accuracy: 0.7682
Epoch 34/50 - 254s 42ms/step - loss: 0.0329 - accuracy: 0.9837 - val_loss: 0.7655 - val_accuracy: 0.7630
Epoch 35/50 - 258s 42ms/step - loss: 0.0298 - accuracy: 0.9831 - val_loss: 0.8209 - val_accuracy: 0.7702
Epoch 36/50 - 262s 43ms/step - loss: 0.0268 - accuracy: 0.9829 - val_loss: 1.0150 - val_accuracy: 0.7676
Epoch 37/50 - 252s 41ms/step - loss: 0.0297 - accuracy: 0.9833 - val_loss: 0.8822 - val_accuracy: 0.7669
Epoch 38/50 - 256s 42ms/step - loss: 0.0257 - accuracy: 0.9833 - val_loss: 0.9971 - val_accuracy: 0.7630
Epoch 39/50 - 264s 43ms/step - loss: 0.0265 - accuracy: 0.9854 - val_loss: 1.0108 - val_accuracy: 0.7630
Epoch 40/50 - 255s 42ms/step - loss: 0.0281 - accuracy: 0.9856 - val_loss: 1.0631 - val_accuracy: 0.7663
Epoch 41/50 - 255s 42ms/step - loss: 0.0277 - accuracy: 0.9860 - val_loss: 0.9735 - val_accuracy: 0.7367
Epoch 42/50 - 256s 42ms/step - loss: 0.0357 - accuracy: 0.9847 - val_loss: 0.9636 - val_accuracy: 0.7590
Epoch 43/50 - 259s 43ms/step - loss: 0.0256 - accuracy: 0.9864 - val_loss: 1.0202 - val_accuracy: 0.7479
Epoch 44/50 - 256s 42ms/step - loss: 0.0250 - accuracy: 0.9864 - val_loss: 0.9628 - val_accuracy: 0.7433
Epoch 45/50 - 256s 42ms/step - loss: 0.0264 - accuracy: 0.9860 - val_loss: 0.9018 - val_accuracy: 0.7229
Epoch 46/50 - 256s 42ms/step - loss: 0.0254 - accuracy: 0.9847 - val_loss: 0.9766 - val_accuracy: 0.7603
Epoch 47/50 - 260s 43ms/step - loss: 0.0248 - accuracy: 0.9851 - val_loss: 1.0326 - val_accuracy: 0.7466
Epoch 48/50 - 255s 42ms/step - loss: 0.0248 - accuracy: 0.9852 - val_loss: 1.1273 - val_accuracy: 0.7557
Epoch 49/50 - 255s 42ms/step - loss: 0.0251 - accuracy: 0.9854 - val_loss: 1.0054 - val_accuracy: 0.7426
Epoch 50/50 - 259s 43ms/step - loss: 0.0245 - accuracy: 0.9859 - val_loss: 1.1545 - val_accuracy: 0.7472
In [118]:
y_pred = model.predict(X_test_dtm)

y_test_class = np.argmax(y_test,axis=1)  # note: y_test already holds integer labels, so this argmax is unnecessary (it triggers the warning below)
y_pred_class = np.argmax(y_pred,axis=1)  # convert the two-column scores into 0/1 class predictions
C:\Users\psoni\AppData\Roaming\Python\Python37\site-packages\numpy\core\fromnumeric.py:61: FutureWarning: The current behaviour of 'Series.argmax' is deprecated, use 'idxmax' instead. The behavior of 'argmax' will be corrected to return the positional maximum in the future. For now, use 'series.values.argmax' or 'np.argmax(np.array(values))' to get the position of the maximum row. return bound(*args, **kwds)
In [119]:
model.predict(X_test_dtm)
Out[119]:
array([[0.01138377, 0.9992344 ],
       [0.04089662, 0.9939358 ],
       [0.9814341 , 0.11882296],
       ...,
       [0.00738016, 0.9997079 ],
       [0.13662584, 0.9303428 ],
       [0.9997199 , 0.00991459]], dtype=float32)
In [120]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,y_pred_class))
print(confusion_matrix(y_test,y_pred_class))
print(accuracy_score(y_test,y_pred_class))
              precision    recall  f1-score   support

           0       0.81      0.72      0.76       866
           1       0.68      0.78      0.73       657

    accuracy                           0.75      1523
   macro avg       0.75      0.75      0.75      1523
weighted avg       0.76      0.75      0.75      1523

[[623 243]
 [142 515]]
0.747209455022981
In [121]:
f1_score(y_test,y_pred_class)
Out[121]:
0.7279151943462897
In [122]:
sns.lineplot(H.epoch,H.history["loss"])
sns.lineplot(H.epoch,H.history["accuracy"])
Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b8bd76948>
[Notebook image: training loss and accuracy vs. epoch]
In [123]:
sns.lineplot(H.epoch,H.history["val_loss"])
sns.lineplot(H.epoch,H.history["val_accuracy"])
Out[123]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b8d3612c8>
[Notebook image: validation loss and accuracy vs. epoch]
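The two lineplot cells above draw loss and accuracy on the same unlabeled axes. A hedged sketch that puts training and validation curves side by side with labelled axes and a legend, using the History object H returned by model.fit:
In [ ]:
# Sketch (assumption): plot train vs. validation curves from the Keras History object.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(H.epoch, H.history["loss"], label="train")
ax1.plot(H.epoch, H.history["val_loss"], label="validation")
ax1.set_xlabel("epoch"); ax1.set_ylabel("loss"); ax1.legend()
ax2.plot(H.epoch, H.history["accuracy"], label="train")
ax2.plot(H.epoch, H.history["val_accuracy"], label="validation")
ax2.set_xlabel("epoch"); ax2.set_ylabel("accuracy"); ax2.legend()
plt.show()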