In [1]:
import scattertext as ST
import tarfile, urllib.request, io
import pandas as pd
from IPython.display import IFrame
In [3]:
'''From Bo Pang's website: https://www.cs.cornell.edu/people/pabo/movie-review-data/

Data from:
"A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization
Based on Minimum Cuts", Proceedings of the ACL, 2004

Read the remote tarball.
'''
SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'
data = io.BytesIO(urllib.request.urlopen(SUBJECTIVITY_URL).read())
tarball = tarfile.open(fileobj=data, mode='r:gz')
readme = tarball.extractfile('subjdata.README.1.0').read()
quote = tarball.extractfile('quote.tok.gt9.5000').read()
plot = tarball.extractfile('plot.tok.gt9.5000').read()
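The archive's contents can be inspected before going further; a quick sketch using the tarball and README bytes already in hand:
In [ ]:
# List the members of the archive
print(tarball.getnames())
# Preview the first few hundred characters of the README
print(readme.decode('utf-8', errors='ignore')[:300])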
In [4]:
# Examples of subjective sentences in corpus
quote.decode('utf-8', errors='ignore').split('\n')[:3]
Out[4]:
['smart and alert , thirteen conversations about one thing is a small gem . ',
 'color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . ',
 'it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . ']
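The plot-summary sentences, treated below as the objective class, can be previewed the same way:
In [ ]:
# Examples of objective (plot-summary) sentences in the corpus
plot.decode('utf-8', errors='ignore').split('\n')[:3]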
In [5]:
'''Construct a subjective vs. objective pandas dataframe,
treating review quotes as subjective and plot-summary sentences as objective.
'''
df = pd.DataFrame(
    [{'text': text.strip(), 'label': 'subjective'} for text 
     in quote.decode('utf-8', errors='ignore').split('\n')] 
    + [{'text': text.strip(), 'label': 'objective'} for text 
       in plot.decode('utf-8', errors='ignore').split('\n')]
)
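Splitting on '\n' leaves one empty string per file (from the trailing newline), which becomes an empty-text row. An optional cleanup, if you want those rows dropped before building the matrix:
In [ ]:
# Drop rows whose text is empty, then check the class balance
df = df[df['text'].str.len() > 0]
df['label'].value_counts()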
In [6]:
'''Convert the pandas dataframe to a term-document matrix, indicating that
the category column is "label" and the text column is "text".'''

term_doc_mat = ST.TermDocMatrixFromPandas(data_frame=df,
                                          category_col='label',
                                          text_col='text',
                                          # Note: use nlp=spacy.en.English() for text that's not pre-tokenized
                                          nlp=ST.fast_but_crap_nlp
                                          ).build()
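Before visualizing, the matrix can be sanity-checked with get_term_freq_df(), the same accessor used later in this notebook:
In [ ]:
# Peek at the per-category term counts
term_doc_mat.get_term_freq_df().head()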
In [10]:
'''
Filter out bigrams with PMI < 3, and unigrams and bigrams that occur fewer than 20 times.
The variable html is a string containing the HTML that makes up the scattertext visualization.
'''
html = ST.produce_scattertext_html(term_doc_mat,
                                   category='subjective',
                                   category_name='Subjective Term Frequency',
                                   not_category_name='Objective Term Frequency',
                                   protocol='https',
                                   # sic: 'thresold' is the keyword's spelling in this version of scattertext
                                   pmi_filter_thresold=3,
                                   minimum_term_frequency=20)

# Hack to display HTML with D3 in Jupyter Notebook
with open('subj_obj_scatter.html', 'wb') as f:
    f.write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter.html', width=1000, height=1000)
Out[10]:
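Outside of Jupyter, the saved file can simply be opened in a browser; a minimal sketch using only the standard library:
In [ ]:
import webbrowser
# Open the saved visualization in the default browser
webbrowser.open('subj_obj_scatter.html')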
In [38]:
''' Display the top 20 terms most characteristic of the subjective label, along with their frequencies.
'''
term_freq_df = term_doc_mat.get_term_freq_df()
term_freq_df['Subjective Score'] = term_doc_mat.get_scaled_f_scores('subjective', scaler_algo='percentile')
term_freq_df = term_freq_df.sort_values(by='Subjective Score', ascending=False)
term_freq_df.iloc[:20]
Out[38]:
term            objective freq  subjective freq  Subjective Score
movie that                   0               75          0.803250
entertaining                 2               73          0.771629
film s                       2               69          0.767533
but it                       6              157          0.766663
i                           13              275          0.755910
interesting                  3               70          0.752203
film that                    4               77          0.744846
performances                 5               89          0.742972
of its                       6              103          0.742011
in its                       5               84          0.737945
me                           2               51          0.737812
script                       4               71          0.736981
movie is                     5               83          0.736840
if you                       6               96          0.736319
fascinating                  2               48          0.730420
cinematic                    2               47          0.727758
funny                        9              126          0.726650
laughs                       0               30          0.725776
movie s                      0               30          0.725776
you re                       4               64          0.725331
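The mirror-image view, terms most characteristic of the objective label, reuses get_scaled_f_scores with the other category:
In [ ]:
# Score terms toward the objective category and show the top 10
term_freq_df['Objective Score'] = term_doc_mat.get_scaled_f_scores('objective', scaler_algo='percentile')
term_freq_df.sort_values(by='Objective Score', ascending=False).iloc[:10]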
In [11]:
''' Display the unigrams most characteristic of this corpus relative to general English,
restricted to terms that also appear in the background corpus.

Note: "doesn", "isn", and "didn" are artifacts of the corpus's pre-tokenization,
which splits contractions like "doesn't" into "doesn" and "t".
'''
characteristic_terms = term_doc_mat.get_posterior_mean_ratio_scores_vs_background()
characteristic_terms[characteristic_terms['background'] > 0].iloc[:20]
Out[11]:
term           corpus   background  Log Posterior Mean Ratio
doesn           176.0    1101832.0                  6.972770
isn             125.0    1345149.0                  6.392687
discovers        70.0    1974534.0                  5.356073
cinematic        49.0    1255895.0                  5.091466
filmmaker        51.0    1493747.0                  5.063639
cannot           29.0      88737.0                  4.860555
filmmaking       37.0    1061519.0                  4.768377
thriller         78.0    5364843.0                  4.722203
didn             32.0     850882.0                  4.648173
filmmakers       39.0    1657073.0                  4.629892
comedy          229.0   22993280.0                  4.591236
quirky           35.0    1436076.0                  4.553131
documentary     113.0   10429008.0                  4.547708
film           1006.0  116097842.0                  4.512189
entertaining     75.0    6330073.0                  4.503101
mysterious       65.0    5252752.0                  4.483029
decides          58.0    4588774.0                  4.447191
performances     94.0    9272429.0                  4.417802
learns           40.0    2570984.0                  4.390325
hasn             20.0      76625.0                  4.352190
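The complementary slice, terms that never appear in the background corpus, is the same frame filtered the other way:
In [ ]:
# Terms unique to this corpus (absent from the background word counts)
characteristic_terms[characteristic_terms['background'] == 0].iloc[:20]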