In [1]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt  
In [2]:
train  = pd.read_csv('train_E6oV3lV.csv') 
test = pd.read_csv('test_tweets_anuFYb8.csv')
In [3]:
train[train['label'] == 0].head(10)
Out[3]:
id label tweet
0 1 0 @user when a father is dysfunctional and is s...
1 2 0 @user @user thanks for #lyft credit i can't us...
2 3 0 bihday your majesty
3 4 0 #model i love u take with u all the time in ...
4 5 0 factsguide: society now #motivation
5 6 0 [2/2] huge fan fare and big talking before the...
6 7 0 @user camping tomorrow @user @user @user @use...
7 8 0 the next school year is the year for exams.ðŸ˜...
8 9 0 we won!!! love the land!!! #allin #cavs #champ...
9 10 0 @user @user welcome here ! i'm it's so #gr...
In [4]:
train[train['label'] == 1].head(10)
Out[4]:
id label tweet
13 14 1 @user #cnn calls #michigan middle school 'buil...
14 15 1 no comment! in #australia #opkillingbay #se...
17 18 1 retweet if you agree!
23 24 1 @user @user lumpy says i am a . prove it lumpy.
34 35 1 it's unbelievable that in the 21st century we'...
56 57 1 @user lets fight against #love #peace
68 69 1 😩the white establishment can't have blk fol...
77 78 1 @user hey, white people: you can call people '...
82 83 1 how the #altright uses & insecurity to lu...
111 112 1 @user i'm not interested in a #linguistics tha...
In [5]:
train.shape, test.shape
Out[5]:
((31962, 3), (17197, 2))
In [6]:
train["label"].value_counts()
Out[6]:
0    29720
1     2242
Name: label, dtype: int64
In [7]:
length_train = train['tweet'].str.len() 
length_test = test['tweet'].str.len() 
plt.hist(length_train, bins=20, label="train_tweets") 
plt.hist(length_test, bins=20, label="test_tweets") 
plt.legend() 
plt.show()
In [8]:
combi = pd.concat([train, test], ignore_index=True, sort=False)  # DataFrame.append is deprecated/removed in newer pandas 
combi.shape
Out[8]:
(49159, 3)
In [9]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        # re.escape avoids errors if the matched text contains regex metacharacters
        input_txt = re.sub(re.escape(i), '', input_txt)
    return input_txt

combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], r"@[\w]*") 
combi.head()
Out[9]:
id label tweet tidy_tweet
0 1 0.0 @user when a father is dysfunctional and is s... when a father is dysfunctional and is so sel...
1 2 0.0 @user @user thanks for #lyft credit i can't us... thanks for #lyft credit i can't use cause th...
2 3 0.0 bihday your majesty bihday your majesty
3 4 0.0 #model i love u take with u all the time in ... #model i love u take with u all the time in ...
4 5 0.0 factsguide: society now #motivation factsguide: society now #motivation
In [10]:
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True) 
combi.head(10)
Out[10]:
id label tweet tidy_tweet
0 1 0.0 @user when a father is dysfunctional and is s... when a father is dysfunctional and is so sel...
1 2 0.0 @user @user thanks for #lyft credit i can't us... thanks for #lyft credit i can t use cause th...
2 3 0.0 bihday your majesty bihday your majesty
3 4 0.0 #model i love u take with u all the time in ... #model i love u take with u all the time in ...
4 5 0.0 factsguide: society now #motivation factsguide society now #motivation
5 6 0.0 [2/2] huge fan fare and big talking before the... huge fan fare and big talking before the...
6 7 0.0 @user camping tomorrow @user @user @user @use... camping tomorrow danny
7 8 0.0 the next school year is the year for exams.ðŸ˜... the next school year is the year for exams ...
8 9 0.0 we won!!! love the land!!! #allin #cavs #champ... we won love the land #allin #cavs #champ...
9 10 0.0 @user @user welcome here ! i'm it's so #gr... welcome here i m it s so #gr
In [11]:
combi['tidy_tweet'] = combi['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
combi.head()
Out[11]:
id label tweet tidy_tweet
0 1 0.0 @user when a father is dysfunctional and is s... when father dysfunctional selfish drags kids i...
1 2 0.0 @user @user thanks for #lyft credit i can't us... thanks #lyft credit cause they offer wheelchai...
2 3 0.0 bihday your majesty bihday your majesty
3 4 0.0 #model i love u take with u all the time in ... #model love take with time
4 5 0.0 factsguide: society now #motivation factsguide society #motivation
In [12]:
tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split()) # tokenizing 
tokenized_tweet.head()
Out[12]:
0    [when, father, dysfunctional, selfish, drags, ...
1    [thanks, #lyft, credit, cause, they, offer, wh...
2                              [bihday, your, majesty]
3                     [#model, love, take, with, time]
4                   [factsguide, society, #motivation]
Name: tidy_tweet, dtype: object
In [13]:
from nltk.stem.porter import PorterStemmer 
stemmer = PorterStemmer() 
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
In [14]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])    
combi['tidy_tweet'] = tokenized_tweet
In [15]:
combi.head()
Out[15]:
id label tweet tidy_tweet
0 1 0.0 @user when a father is dysfunctional and is s... when father dysfunct selfish drag kid into dys...
1 2 0.0 @user @user thanks for #lyft credit i can't us... thank #lyft credit caus they offer wheelchair ...
2 3 0.0 bihday your majesty bihday your majesti
3 4 0.0 #model i love u take with u all the time in ... #model love take with time
4 5 0.0 factsguide: society now #motivation factsguid societi #motiv
In [16]:
all_words = ' '.join([text for text in combi['tidy_tweet']]) 
from wordcloud import WordCloud 
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words) 
plt.figure(figsize=(10, 7)) 
plt.imshow(wordcloud, interpolation="bilinear") 
plt.axis('off') 
plt.show()
In [18]:
normal_words = ' '.join([text for text in combi['tidy_tweet'][combi['label'] == 0]]) 
from wordcloud import WordCloud 
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(normal_words) 
plt.figure(figsize=(10, 7)) 
plt.imshow(wordcloud, interpolation="bilinear") 
plt.axis('off') 
plt.show()
In [19]:
negative_words = ' '.join([text for text in combi['tidy_tweet'][combi['label'] == 1]]) 
from wordcloud import WordCloud 
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(negative_words) 
plt.figure(figsize=(10, 7)) 
plt.imshow(wordcloud, interpolation="bilinear") 
plt.axis('off') 
plt.show()
In [ ]:
# function to collect hashtags 
def hashtag_extract(x):
    hashtags = []
    # loop over the tweets and collect every #hashtag in each one
    for i in x:
        ht = re.findall(r"#(\w+)", i)
        hashtags.append(ht)
    return hashtags
    
# extracting hashtags from non racist/sexist tweets 
HT_regular = hashtag_extract(combi['tidy_tweet'][combi['label'] == 0]) 
# extracting hashtags from racist/sexist tweets 
HT_negative = hashtag_extract(combi['tidy_tweet'][combi['label'] == 1]) 
# unnesting list 
HT_regular = sum(HT_regular,[]) 
HT_negative = sum(HT_negative,[])
In [25]:
a = nltk.FreqDist(HT_regular) 
d = pd.DataFrame({'Hashtag': list(a.keys()),'Count': list(a.values())}) 
# selecting top 20 most frequent hashtags     
d = d.nlargest(columns="Count", n = 20) 
plt.figure(figsize=(16,5)) 
ax = sns.barplot(data=d, x= "Hashtag", y = "Count") 
ax.set(ylabel = 'Count') 
plt.show()
In [26]:
a = nltk.FreqDist(HT_negative) 
d = pd.DataFrame({'Hashtag': list(a.keys()),'Count': list(a.values())}) 
# selecting top 20 most frequent hashtags     
d = d.nlargest(columns="Count", n = 20) 
plt.figure(figsize=(16,5)) 
ax = sns.barplot(data=d, x= "Hashtag", y = "Count") 
ax.set(ylabel = 'Count') 
plt.show()
In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import gensim
In [28]:
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english') 
bow = bow_vectorizer.fit_transform(combi['tidy_tweet']) 
bow.shape
Out[28]:
(49159, 1000)
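TfidfVectorizer is imported above but never used in these cells; a minimal sketch of building TF-IDF features on the same cleaned text is shown below. The parameters simply mirror the bag-of-words cell and are illustrative assumptions, not part of the original run.
In [ ]:
# TF-IDF features on the cleaned tweets, analogous to the bag-of-words cell above
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english') 
tfidf = tfidf_vectorizer.fit_transform(combi['tidy_tweet']) 
tfidf.shape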
In [29]:
tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split()) # tokenizing 
model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
            size=200,       # dimensionality of the word vectors (renamed vector_size in gensim >= 4.0)
            window=5,       # context window size
            min_count=2,    # ignore words appearing fewer than 2 times
            sg=1,           # 1 for skip-gram model
            hs=0,           # use negative sampling instead of hierarchical softmax
            negative=10,    # no. of negative samples
            workers=2,      # no. of cores
            seed=34) 

model_w2v.train(tokenized_tweet, total_examples=len(combi['tidy_tweet']), epochs=20)
Out[29]:
(6510028, 7536020)
In [31]:
model_w2v.wv.most_similar(positive="dinner")
Out[31]:
[('#avocado', 0.5617818832397461),
 ('noodl', 0.5614859461784363),
 ('spaghetti', 0.554806113243103),
 ('#cellar', 0.5474801063537598),
 ('melani', 0.546575665473938),
 ('spinach', 0.5453384518623352),
 ('cookout', 0.5447225570678711),
 ('#biall', 0.542272686958313),
 ('gown', 0.5421741008758545),
 ('fav', 0.5389130115509033)]
In [32]:
model_w2v.wv.most_similar(positive="trump")
Out[32]:
[('donald', 0.5605064034461975),
 ('hillari', 0.5432127118110657),
 ('phoni', 0.5253685712814331),
 ('melo', 0.523817777633667),
 ('unstabl', 0.5165114402770996),
 ('#delegaterevolt', 0.5160558223724365),
 ('unfit', 0.5129935145378113),
 ('tomlin', 0.5082054734230042),
 ('unfavor', 0.4998696744441986),
 ('jibe', 0.4985198676586151)]
In [37]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v.wv[word].reshape((1, size))
            count += 1.
        except KeyError:  # token not in the Word2Vec vocabulary
            continue
    if count != 0:
        vec /= count
    return vec

wordvec_arrays = np.zeros((len(tokenized_tweet), 200)) 
for i in range(len(tokenized_tweet)):
    wordvec_arrays[i,:] = word_vector(tokenized_tweet[i], 200)
wordvec_df = pd.DataFrame(wordvec_arrays) 
wordvec_df.shape
Out[37]:
(49159, 200)
In [38]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score
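The notebook stops after these imports; a minimal sketch of the remaining modelling step on the bag-of-words features is shown below. It assumes the first len(train) rows of bow correspond to the labelled training tweets (true here, since train and test were concatenated in that order); the 70/30 split, random_state=42 and default LogisticRegression settings are illustrative choices, not the original author's.
In [ ]:
# split the combined bag-of-words matrix back into labelled and unlabelled parts
train_bow = bow[:len(train), :] 
test_bow = bow[len(train):, :] 

# hold out 30% of the labelled tweets for validation
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['label'],
                                                          random_state=42, test_size=0.3) 

lreg = LogisticRegression() 
lreg.fit(xtrain_bow, ytrain)            # train the classifier 

prediction = lreg.predict(xvalid_bow)   # predict labels on the validation set 
f1_score(yvalid, prediction)            # F1 score on the validation set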