# Preprocessing Text Data



In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import sqlite3
import pprint

# Pull a small sample of tweet texts out of the local SQLite database.
conn = sqlite3.connect('twitter.db')
# Decode TEXT columns as latin1: every byte value maps to a code point, so
# decoding can never raise. NOTE(review): presumably the stored text is not
# all valid UTF-8; any genuine UTF-8 multi-byte characters will come out
# garbled under this decoding -- confirm against the data source.
conn.text_factory = lambda x: str(x, 'latin1')
try:
    curs = conn.cursor()
    curs.execute('SELECT content FROM tweets LIMIT 10')
    # fetchall() returns one 1-tuple per row (a single column is selected);
    # flatten() turns the resulting (n, 1) array into a 1-D array of strings.
    data = np.array(curs.fetchall()).flatten()
finally:
    # The rows are fully materialised in `data`, so release the database
    # handle instead of leaving it open for the life of the kernel.
    conn.close()

## Using Regex, TextBlob and NLTK 

* [TextBlob](https://textblob.readthedocs.io/en/dev/)
* [Regex](https://docs.python.org/3/howto/regex.html)
* [NLTK](https://www.nltk.org/)

In [4]:
from textblob import TextBlob
from textblob import Word
from stop_words import get_stop_words
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import re

# Build the stop-word list from two sources: NLTK's English corpus and the
# `stop_words` package.
mystopwords = list(stopwords.words('english'))
mystopwords.extend(list(get_stop_words('en')))
# add your own stop words here
mystopwords.extend(['https','http'])
# The combined list can contain the same word from both sources and a list
# lookup is a linear scan per token, so membership tests go through a set.
stopword_set = set(mystopwords)

# Regular expression that detects a digit or a non-word character anywhere in
# a token. Written as a raw string because '\d' and '\W' are invalid escape
# sequences in an ordinary string literal (the compiled pattern is unchanged).
p = re.compile(r'[\d\W]')

# wdata[i] is the list of cleaned words kept from the i-th tweet in `data`.
wdata = []
for t in data:
    tb = TextBlob(t)

    # print the textblob sentiment analysis results
    print(tb.sentiment)

    wlist = []
    for w in tb.words:
        # add your own data cleaning code here
        # if numbers or non-word characters are found, ignore the token
        if p.search(w) is not None:
            continue
        # if w is a stopword (compared case-insensitively), ignore it
        if w.lower() in stopword_set:
            continue
        # original casing is preserved in the output
        wlist.append(w)
        # (optional) Word(w).synsets can be printed here to inspect synsets
    wdata.append(wlist)

pprint.pprint(wdata)

[nltk_data] Downloading package punkt to /Users/lipyeow/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lipyeow/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lipyeow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Sentiment(polarity=0.3333333333333333, subjectivity=1.0)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=-0.15, subjectivity=0.39999999999999997)
Sentiment(polarity=0.16818181818181818, subjectivity=0.32727272727272727)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.2, subjectivity=0.5)
Sentiment(polarity=0.3, subjectivity=0.762962962962963)
Sentiment(polarity=0.16468253968253968, subjectivity=0.5257936507936507)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.0, subjectivity=0.0)
[['Extraordinary',
  'evidence',
  'Treasury',
  'committee',
  'Jon',
  'Thompson

## Using sklearn

[Documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#feature-extraction)

### Convert Each Document to Term Frequency Vectors

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram term counts: learn the vocabulary from the tweets, count each term
# per document, then convert the result to a dense array with .toarray().
vectorizer = CountVectorizer()
unigram_counts = vectorizer.fit_transform(data)
X = unigram_counts.toarray()

pprint.pprint(X)
pprint.pprint(X.shape)

# Unigram + bigram term counts. The explicit token pattern accepts any run of
# one or more word characters between word boundaries.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=1,
)
bigram_counts = bigram_vectorizer.fit_transform(data)
X2 = bigram_counts.toarray()

pprint.pprint(X2)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
(10, 142)
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)


### Convert TF Vectors to TF.IDF Vectors



In [31]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the unigram count matrix X into TF-IDF values. smooth_idf=False
# is passed explicitly to turn off the transformer's IDF smoothing option.
transformer = TfidfTransformer(smooth_idf=False)
tfidf_weights = transformer.fit_transform(X)
# Convert to a dense array so pprint shows the values directly.
tfidf = tfidf_weights.toarray()

pprint.pprint(tfidf)

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.26446362,  0.        , ...,  0.26446362,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.26132862, ...,  0.        ,
         0.        ,  0.        ]])


You can also go directly from raw text to TF-IDF vectors in one step with `TfidfVectorizer`, which combines `CountVectorizer` and `TfidfTransformer`:

    from sklearn.feature_extraction.text import TfidfVectorizer

## Word Embeddings

* [CBOW & Skip-grams](https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/), with Python code showing how to train your own word2vec model
* [Pretrained embeddings](http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/)

The pretrained word2vec files are about 1-2GB compressed.

In [2]:
import gensim

# Load Google's pre-trained Word2Vec vectors from the local binary file.
word2vec_path = './GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=True,
)

# Analogy query: words most similar to 'woman' + 'king' - 'man'.
analogy_matches = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
)
pprint.pprint(analogy_matches)

[('queen', 0.7118192315101624),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236843824386597),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134939193726),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]
