%matplotlib inline
from sklearn.datasets import fetch_20newsgroups
import re, warnings
import scattertext as ST
from pandas.core.common import SettingWithCopyWarning
import spacy
import mpld3
import matplotlib.pyplot as plt
# Turn on mpld3's notebook mode for this session.
mpld3.enable_notebook()
# Ignore every SettingWithCopyWarning (the category imported from pandas above).
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
# English spaCy pipeline; passed as `nlp` to the term-document matrix factory below.
# NOTE(review): `spacy.en.English()` looks like an early spaCy entry point --
# confirm the installed spaCy version still exposes it.
nlp = spacy.en.English()
# The two newsgroups compared throughout this script.
categories = ['alt.atheism', 'talk.religion.misc']
# 'train' subset of the 20 Newsgroups data, restricted to `categories`.
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
# Factory for the function that strips headers, quotes and attribution lines
# from newsgroup posts and removes the last remaining line.
def clean_newsgroup_data_factory():
    """Build the text-cleaning function applied to each newsgroup post.

    The regular expressions are compiled once here and captured by the
    returned closure, so they are not rebuilt for every post.

    Returns:
        A function ``clean_newsgroup_data(text)`` that drops header, quote,
        citation and attribution lines, deletes e-mail addresses from the
        lines it keeps, discards the final kept line, and returns the rest
        joined with newlines.
    """
    # A line is dropped when it STARTS with a quote marker, one of the listed
    # header names or an "In article <id@host>" citation, or when it ENDS
    # with "writes"/"writes:" or a literal .com/.edu/.org.  The dots are
    # escaped so that e.g. "telecom" is not mistaken for a ".com" suffix.
    badline_re = re.compile(
        r'(^(>|From:|Subject:|Organization:'
        r'|Distribution:|NNTP-Posting-Host:|Lines:|News-Software:'
        r'|In article <.+?@.+>)|(writes:?|\.com|\.edu|\.org)$)')
    email_address_finder = re.compile(
        r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b', flags=re.IGNORECASE)

    def clean_newsgroup_data(text):
        """Return ``text`` without bad lines, e-mail addresses or its last kept line."""
        kept_lines = [
            email_address_finder.sub('', line)
            for line in text.split('\n')
            # search(), not match(): match() only tests the start of the
            # line, so the "$"-anchored suffix alternatives could never fire
            # on a real line such as "John Smith writes:".
            if badline_re.search(line.strip()) is None
        ]
        # The final kept line is always discarded.
        return '\n'.join(kept_lines[:-1])

    return clean_newsgroup_data
# Pair every post with the name of its newsgroup and build the term-document
# matrix, cleaning each post with clean_newsgroup_data_factory() and passing
# the `nlp` pipeline to the factory.
term_doc_matrix_factory = ST.TermDocMatrixFactory(
    # Labels are looked up in newsgroups.target_names, the list that the
    # integer values in `target` index into.  Indexing the hand-written
    # `categories` list instead is only correct while that list happens to be
    # in the same order as the label indices.
    category_text_iter=((newsgroups.target_names[idx], text)
                        for idx, text
                        in zip(newsgroups.target, newsgroups.data)),  # only pass in full category names
    clean_function=clean_newsgroup_data_factory(),
    nlp=nlp)
term_doc_matrix = term_doc_matrix_factory.build()
# First ten rows returned by get_scaled_f_score_scores_vs_background().
# This bare expression only displays something when it ends a notebook cell.
(term_doc_matrix
 .get_scaled_f_score_scores_vs_background()
 .iloc[:10])
# we can also use get_rudder_scores_vs_background,
# get_posterior_mean_ratio_scores_vs_background,
# or get_fisher_scores_vs_background

# Per-term frequency table, extended below with one score column per method.
df = term_doc_matrix.get_term_freq_df()
df['Scaled f-score'] = term_doc_matrix.get_scaled_f_scores('alt.atheism', scaler_algo='percentile')
# other options include get_logistic_regression_coefs_l1, get_rudder_scores,
# get_posterior_mean_ratio_scores, and get_fisher_scores
df.sort_values('Scaled f-score', ascending=False).iloc[:10]

# get_logistic_regression_coefs_l2 returns three values: the per-term
# coefficients plus two numbers reported below as accuracies.
df['L2 coefs'], acc, baseline = term_doc_matrix.get_logistic_regression_coefs_l2('alt.atheism')
# A single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function; the original
# `print 'label', value` form is a SyntaxError on Python 3.
print('classifier accuracy %s' % (acc,))
print('majority-class baseline accuracy %s' % (baseline,))
df.sort_values('L2 coefs', ascending=False).iloc[:10]
# The original called imp.reload() without ever importing `imp`, which raises
# NameError.  Bring a reload() into scope in a way that works on both
# Python 2 (imp.reload) and Python 3.4+ (importlib.reload).
try:
    from importlib import reload
except ImportError:  # Python 2: importlib exists but has no reload()
    from imp import reload
import scattertext as ST
# Re-execute the already-imported scattertext module in place.
reload(ST)
import scattertext
# Default matplotlib figure size: 10 x 10.
plt.rcParams['figure.figsize'] = (10, 10)

# Chart over the term-document matrix built earlier.
scatter_chart = ST.ScatterChart(
    term_doc_matrix,
    jitter=0,  # 0.05 adds a bit of jitter
)

# Scores handed to draw() as `scores`, computed for the 'alt.atheism' category.
atheism_scores = term_doc_matrix.get_posterior_mean_ratio_scores('alt.atheism')

# Terms handed to draw() as `words_to_annotate`.
annotated_terms = [
    'lds', 'fallacy', 'assertion', 'god exists', 'logical', 'lie', 'existence',
    'atheist', 'saw', 'shall be', 'paradise', 'argument', 'islamic', 'nation',
    'master', 'jesus', 'abraham', 'moses', 'christ', 'jesus christ',
    'judas', 'father', '', 'zoroastrian',
]

# draw() returns a pair: a data frame and the chart's HTML.
chart_df, chart_html = scatter_chart.draw(
    'alt.atheism',
    num_top_words_to_annotate=0,
    transform=ST.Scalers.percentile_ordinal,
    scores=atheism_scores,
    words_to_annotate=annotated_terms,
)
# Write the chart HTML to alt.atheism.html, followed by two tables: the first
# 30 and the last 30 rows of chart_df when sorted by `color_scores`,
# descending.  The sort is done once and sliced twice.
ranked_terms = chart_df.sort_values(by='color_scores', ascending=False)
# `with` closes the file even if a write or to_html() raises; the original
# open()/close() pair left the handle open on any error in between.
with open('alt.atheism.html', 'w') as visualization_fn:
    visualization_fn.write(chart_html)
    # write() with an explicit newline emits exactly what `print >>file, s`
    # did, and is valid on both Python 2 and Python 3.
    visualization_fn.write('<h2>Top terms</h2>\n')
    ranked_terms[:30].to_html(visualization_fn)
    visualization_fn.write('<h2>Bottom terms</h2>\n')
    ranked_terms[-30:].to_html(visualization_fn)