import io
import tarfile
# Explicitly import the submodule: in Python 3, a bare `import urllib` does
# not guarantee `urllib.request` is available as an attribute.
import urllib.request

import pandas as pd
import scattertext as ST
from IPython.display import IFrame
'''From Bo Pang's website: https://www.cs.cornell.edu/people/pabo/movie-review-data/
Data from:
``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization
Based on Minimum Cuts'', Proceedings of the ACL, 2004.
Read the remote tarball.
'''
# Pang/Lee rotten_imdb subjectivity dataset (5000 subjective review snippets,
# 5000 objective plot sentences, pre-tokenized).
SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'
# Download the whole tarball into memory so tarfile can seek within it.
data = io.BytesIO(urllib.request.urlopen(SUBJECTIVITY_URL).read())
# Use a context manager so the TarFile (and its member handles) are closed
# once the three files have been read, instead of leaking until GC.
with tarfile.open(fileobj=data, mode='r:gz') as tarball:
    readme = tarball.extractfile('subjdata.README.1.0').read()  # bytes
    quote = tarball.extractfile('quote.tok.gt9.5000').read()    # subjective sentences
    plot = tarball.extractfile('plot.tok.gt9.5000').read()      # objective sentences
# Examples of subjective sentences in corpus (notebook display only;
# this bare expression has no effect when run as a plain script)
quote.decode('utf-8', errors='ignore').split('\n')[:3]
'''Construct subjective vs. objective pandas dataframe,
treating review quotes as subjective, and plot points as objective.
'''
subjective_rows = [
    {'text': line.strip(), 'label': 'subjective'}
    for line in quote.decode('utf-8', errors='ignore').split('\n')
]
objective_rows = [
    {'text': line.strip(), 'label': 'objective'}
    for line in plot.decode('utf-8', errors='ignore').split('\n')
]
df = pd.DataFrame(subjective_rows + objective_rows)
'''Convert Pandas dataframe to a term-document matrix, indicating
the category column is "label" and the text column name is "text".'''
term_doc_mat = ST.TermDocMatrixFromPandas(
    data_frame=df,
    category_col='label',
    text_col='text',
    # Note: use nlp=spacy.en.English() for text that's not pre-tokenized
    nlp=ST.fast_but_crap_nlp,
).build()
'''
Filter out bigrams with PMI < 3, and unigrams and bigrams that occur less than 20 times.
The variable html is a string containing the HTML that makes up the scattertext visualization
'''
html = ST.produce_scattertext_html(
    term_doc_mat,
    category='subjective',
    category_name='Subjective Term Frequency',
    not_category_name='Objective Term Frequency',
    protocol='https',
    # NOTE(review): 'thresold' is the parameter's actual (misspelled) name in
    # this scattertext version's API -- do not correct it without checking
    # the installed library.
    pmi_filter_thresold=3,
    minimum_term_frequency=20)
# Hack to display HTML with D3 in Jupyter Notebook.
# Use a context manager so the file is flushed and closed before the
# IFrame below tries to load it (the original leaked the open handle).
with open('subj_obj_scatter.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter.html', width=1000, height=1000)
''' Display top 20 terms that are characteristic of a subjective document-label and their frequencies.
'''
term_freq_df = term_doc_mat.get_term_freq_df()
subjective_scores = term_doc_mat.get_scaled_f_scores('subjective', scaler_algo='percentile')
term_freq_df['Subjective Score'] = subjective_scores
term_freq_df = term_freq_df.sort_values(by='Subjective Score', ascending=False)
# Top 20 most subjective terms (notebook display only)
term_freq_df.iloc[:20]
''' Display unigrams most characteristic of corpus against all of English that aren't unique to it.
Note: "doesn", "isn", and "didn" are a result of the pre-tokenization of the corpus.
'''
characteristic_terms = term_doc_mat.get_posterior_mean_ratio_scores_vs_background()
seen_in_background = characteristic_terms['background'] > 0
# Top 20 characteristic terms that also occur in the background corpus
characteristic_terms[seen_in_background].iloc[:20]