import re # for regular expressions
import nltk # for text manipulation
import string
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
train = pd.read_csv('train_E6oV3lV.csv')
test = pd.read_csv('test_tweets_anuFYb8.csv')
train[train['label'] == 0].head(10)  # sample of non racist/sexist tweets
train[train['label'] == 1].head(10)  # sample of racist/sexist tweets
train.shape, test.shape
train["label"].value_counts()
length_train = train['tweet'].str.len()
length_test = test['tweet'].str.len()
plt.hist(length_train, bins=20, label="train_tweets")
plt.hist(length_test, bins=20, label="test_tweets")
plt.legend()
plt.show()
combi = pd.concat([train, test], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
combi.shape
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        # escape the matched text so it is removed literally, not re-interpreted as a regex
        input_txt = re.sub(re.escape(i), '', input_txt)
    return input_txt
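# quick sanity check (illustrative input):
# remove_pattern("@user thanks for the follow!", r"@[\w]*")
# returns " thanks for the follow!"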
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], r"@[\w]*")  # remove @user handles
combi.head()
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)  # keep only letters and '#'
combi.head(10)
combi['tidy_tweet'] = combi['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))  # drop words of 3 characters or fewer
combi.head()
tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split()) # tokenizing
tokenized_tweet.head()
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])  # stitch the stemmed tokens back into strings
combi['tidy_tweet'] = tokenized_tweet
combi.head()
all_words = ' '.join([text for text in combi['tidy_tweet']])
from wordcloud import WordCloud
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
normal_words = ' '.join([text for text in combi['tidy_tweet'][combi['label'] == 0]])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(normal_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
negative_words = ' '.join([text for text in combi['tidy_tweet'][combi['label'] == 1]])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(negative_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# function to collect hashtags
def hashtag_extract(x):
    hashtags = []
    # loop over the tweets and pull out every #hashtag
    for i in x:
        ht = re.findall(r"#(\w+)", i)
        hashtags.append(ht)
    return hashtags
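# quick sanity check (illustrative input):
# hashtag_extract(["i love #summer and #sun"])
# returns [['summer', 'sun']] -- one list of hashtags per tweet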
# extracting hashtags from non racist/sexist tweets
HT_regular = hashtag_extract(combi['tidy_tweet'][combi['label'] == 0])
# extracting hashtags from racist/sexist tweets
HT_negative = hashtag_extract(combi['tidy_tweet'][combi['label'] == 1])
# unnesting list
HT_regular = sum(HT_regular,[])
HT_negative = sum(HT_negative,[])
a = nltk.FreqDist(HT_regular)  # hashtag frequencies in non racist/sexist tweets
d = pd.DataFrame({'Hashtag': list(a.keys()), 'Count': list(a.values())})
# selecting the top 20 most frequent hashtags
d = d.nlargest(columns="Count", n=20)
plt.figure(figsize=(16, 5))
ax = sns.barplot(data=d, x="Hashtag", y="Count")
ax.set(ylabel='Count')
plt.show()
a = nltk.FreqDist(HT_negative)  # hashtag frequencies in racist/sexist tweets
d = pd.DataFrame({'Hashtag': list(a.keys()), 'Count': list(a.values())})
# selecting the top 20 most frequent hashtags
d = d.nlargest(columns="Count", n=20)
plt.figure(figsize=(16, 5))
ax = sns.barplot(data=d, x="Hashtag", y="Count")
ax.set(ylabel='Count')
plt.show()
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
bow = bow_vectorizer.fit_transform(combi['tidy_tweet'])
bow.shape
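# TfidfVectorizer is imported above but never used in this section; a minimal
# sketch of the analogous TF-IDF features, assuming the same vocabulary limits
# as the bag-of-words step:
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(combi['tidy_tweet'])
tfidf.shape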
tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split()) # tokenizing
model_w2v = gensim.models.Word2Vec(
    tokenized_tweet,
    vector_size=200,  # desired no. of features ('size' in gensim < 4.0)
    window=5,         # context window size
    min_count=2,      # ignore tokens that appear fewer than 2 times
    sg=1,             # 1 for skip-gram model, 0 for CBOW
    hs=0,             # use negative sampling instead of hierarchical softmax
    negative=10,      # no. of negative samples
    workers=2,        # no. of cores
    seed=34)
model_w2v.train(tokenized_tweet, total_examples=len(combi['tidy_tweet']), epochs=20)  # continue training for 20 more epochs (the constructor already ran an initial pass)
model_w2v.wv.most_similar(positive="dinner")
model_w2v.wv.most_similar(positive="trump")
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v.wv[word].reshape((1, size))  # .wv indexing is required in gensim >= 4.0
            count += 1.
        except KeyError:  # token not in the Word2Vec vocabulary
            continue
    if count != 0:
        vec /= count
    return vec
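# note: word_vector averages the vectors of a tweet's in-vocabulary tokens; a
# tweet whose tokens are all out-of-vocabulary comes back as the zero vector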
wordvec_arrays = np.zeros((len(tokenized_tweet), 200))
for i in range(len(tokenized_tweet)):
    wordvec_arrays[i, :] = word_vector(tokenized_tweet[i], 200)
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape
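# one 200-dimensional averaged vector per tweet, rows aligned with combi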
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
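# The section stops at the modelling imports; a minimal sketch of how they fit
# together on the bag-of-words features, assuming the first len(train) rows of
# `bow` correspond to the labelled training tweets (combi was built as train
# followed by test). The 0.3 probability threshold is an illustrative choice
# for the imbalanced classes, not a tuned value.
train_bow = bow[:len(train), :]
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
    train_bow, train['label'], random_state=42, test_size=0.3)
lreg = LogisticRegression()
lreg.fit(xtrain_bow, ytrain)
prediction = lreg.predict_proba(xvalid_bow)      # predicted probabilities per class
prediction_int = (prediction[:, 1] >= 0.3).astype(int)  # assumed threshold
f1_score(yvalid, prediction_int)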