# Allow for displaying plots in Jupyter/IPython, also as .svg
%matplotlib inline
from IPython.display import SVG, display
# Data handling, structures and analysis tools, SQLite database interface
import numpy as np
import pandas as pd
import sqlite3
# 2D plotting library
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as dates
#import matplotlib.gridspec as gridspec
# 2D visualization library overlaying matplotlib
import seaborn as sns
# Date- and time-parsing and manipulation
import time
import datetime
import dateutil.parser
import calendar
# Default options
# Apply the seaborn theme first: sns.set() installs its own plotting context
# (which includes axes.titlesize), so matplotlib overrides made before it
# would be silently replaced.
sns.set(style="whitegrid", color_codes=True)
mpl.rcParams.update({
    'axes.titlesize': 14,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
})
# Keyword arguments for regression lines: red, half transparent
sns_line = {"color" : "r", "alpha" : 0.5}
# Column used for the x-axis of the time-based plots and for sorting the query
x_variable = 'modification_date'
# Read database, attach as Pandas dataframe sorted by date
db = sqlite3.connect("Data/Applications.db")
try:
    # NOTE(review): SQLite's Date() treats a bare numeric argument as a Julian
    # day number. If this column stores Unix timestamps (the plots below pass
    # its values to datetime.fromtimestamp), confirm this ORDER BY sorts as
    # intended.
    df = pd.read_sql_query(
        'SELECT * FROM applications ORDER BY Date(' + x_variable + ') DESC', db)
finally:
    # Release the connection even when the query raises
    db.close()
df.head()
# One figure per descriptive measure: coloured scatter of the measure over
# time with a regression line, saved to Plots/<measure>.svg and displayed.
def _month_label(value, _pos):
    """Format a tick position (read as a Unix timestamp) like 'Jan 17'."""
    return datetime.datetime.fromtimestamp(value).strftime('%b %y')


for y_variable in ('Words', 'Sentences', 'Lines'):
    svg_path = 'Plots/' + y_variable + '.svg'
    fig = plt.figure()
    points = plt.scatter(x=df[x_variable], y=df[y_variable],
                         c=df[y_variable], s=75, cmap="BrBG")
    plt.colorbar(points)
    plot = sns.regplot(x=df[x_variable], y=df[y_variable],
                       data=df, scatter=False, color='r')
    ax = plt.gca()
    # A formatter labels whatever ticks the locator finally picks, so labels
    # cannot drift out of sync with tick positions the way a fixed label
    # list can.
    ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(_month_label))
    # pyplot is called directly instead of through the private sns.plt alias
    plt.title(y_variable + ' per Day')
    ax.set(xlabel='Date')
    plt.savefig(svg_path, format='svg', dpi=300)
    plt.close(fig)
    display(SVG(svg_path))
# Combined figure: the three descriptive measures stacked vertically, each a
# coloured scatter over time with a regression line. Only the bottom panel
# shows date labels.
def _month_label(value, _pos):
    """Format a tick position (read as a Unix timestamp) like 'Jan 17'."""
    return datetime.datetime.fromtimestamp(value).strftime('%b %y')


fig = plt.figure()
measures = ('Words', 'Sentences', 'Lines')
panel_axes = []
for row, y_variable in enumerate(measures, start=1):
    panel_ax = plt.subplot(len(measures), 1, row)
    panel_axes.append(panel_ax)
    points = panel_ax.scatter(x=df['modification_date'], y=df[y_variable],
                              c=df[y_variable], s=75, cmap="BrBG")
    fig.colorbar(points, ax=panel_ax)
    sns.regplot(x=df['modification_date'], y=df[y_variable],
                data=df, scatter=False, color='r', ax=panel_ax)
    if row == 1:
        panel_ax.set_title('Descriptive Data')
    if row < len(measures):
        # Upper panels: no x label and no tick labels
        panel_ax.set_xlabel('')
        panel_ax.set_xticklabels([])
    else:
        panel_ax.set_xlabel('Date')
        # Label the bottom panel from its own ticks via a formatter, so the
        # labels stay correct after tight_layout resizes the axes.
        panel_ax.xaxis.set_major_formatter(
            mpl.ticker.FuncFormatter(_month_label))
ax1, ax2, ax3 = panel_axes
plt.tight_layout(pad=0.6, w_pad=0.8, h_pad=1.2)
plt.savefig('Plots/Descriptives.svg', format='svg', dpi=300)
plt.close(fig)
display(SVG('Plots/Descriptives.svg'))
# Calculate a Readability consensus
def _mean_grade(label):
    """Return the mean of the grade numbers contained in a readability label.

    The label is stripped of 'th' and ' grade', ' and ' becomes '-', and the
    remaining '-'-separated parts are parsed as integers; a label such as
    '8th and 9th grade' therefore yields 8.5.
    """
    bounds = label.replace('th', '').replace(' grade', '').replace(' and ', '-')
    return np.mean([int(part) for part in bounds.split('-')])


consensus = [_mean_grade(grade) for grade in df['Readability']]
df["consensus_mean"] = consensus
df.head()
# Grid of distribution plots, one per readability statistic
sns.set_style("whitegrid", {'grid.linestyle': ''})
fig = plt.figure(figsize=(8,8))
nrows = 4
ncols = 2
# (dataframe column, panel title, x-axis label), in reading order of the grid
textstat_panels = (
    ('flesch_reading_ease', 'Flesch Reading Ease',
     '0 is Confusing, 100 is Easy'),
    ('flesch_kincaid_grade', 'Flesch-Kincaid Grade Level',
     'Grade level needed to read the text'),
    ('gunning_fog', 'Gunning FOG Index',
     'Ideal readability score'),
    ('smog_index', 'SMOG Index',
     'Grade level needed to understand the text'),
    ('automated_readability_index', 'Automated Readability Index',
     'Grade level needed to comprehend the text'),
    ('coleman_liau_index', 'Coleman-Liau Index',
     'Grade level of the text'),
    ('linsear_write_formula', 'Linsear Write Formula',
     'Grade level of the text'),
    ('dale_chall_readability_score', 'Dale-Chall Readability Score',
     'Grade level of the text'),
)
for position, (column, title, xlabel) in enumerate(textstat_panels, start=1):
    plt.subplot(nrows, ncols, position)
    # NOTE(review): distplot is reported as deprecated by newer seaborn
    # releases; it is kept so the rendered output stays the same.
    sns.distplot(df[column])
    plt.title(title)
    plt.xlabel(xlabel)
plt.tight_layout(pad=0.6, w_pad=0.8, h_pad=1.2)
plt.savefig('Plots/Textstats.svg', format='svg', dpi=300)
plt.close(fig)
display(SVG('Plots/Textstats.svg'))
# Readability consensus over time: coloured scatter plus regression line
def _month_label(value, _pos):
    """Format a tick position (read as a Unix timestamp) like 'Jan 17'."""
    return datetime.datetime.fromtimestamp(value).strftime('%b %y')


sns.set_style("whitegrid", {'grid.linestyle': '-'})
y_variable = 'consensus_mean'
fig = plt.figure(figsize=(8, 6))
points = plt.scatter(x=df[x_variable], y=df[y_variable],
                     c=df[y_variable], s=75, cmap="BrBG")
plt.colorbar(points)
plot = sns.regplot(x=df[x_variable], y=df[y_variable],
                   data=df, scatter=False, line_kws=sns_line)
ax = plt.gca()
# Pad the y range by one grade and the x range by two weeks (in seconds)
two_weeks = 60*60*24*7*2
plt.ylim([(df[y_variable].min()-1), (df[y_variable].max()+1)])
plt.xlim([(df[x_variable].min()-two_weeks), (df[x_variable].max()+two_weeks)])
# Format ticks through a formatter rather than a fixed label list: the list
# used to be built before the x limits changed, so labels no longer matched
# the tick positions that were finally drawn.
ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(_month_label))
# pyplot is called directly instead of through the private sns.plt alias
plt.title('Development of required comprehension', y = 1.04)
ax.set(ylabel='Grade Level', xlabel='Date')
plt.tight_layout(pad=0.6, w_pad=0.8, h_pad=1.2)
plt.savefig('Plots/ReadabilityConsensus.svg', format='svg', dpi=300)
plt.close(fig)
display(SVG('Plots/ReadabilityConsensus.svg'))
# Sort the word counts and cut them into three nearly equal parts; the largest
# value of the first part and the smallest value of the last part serve as the
# class bounds.
words = np.array(df['Words'].tolist())
words_sorted = np.sort(words)
words_sorted_split = np.array_split(words_sorted, 3)
words_short_max = np.amax(words_sorted_split[0])
words_long_min = np.amin(words_sorted_split[2])
# Using bounds, create `WordClass` column with categorical designations of length
is_short = df['Words'] <= words_short_max
is_medium = (df['Words'] > words_short_max) & (df['Words'] < words_long_min)
is_long = df['Words'] >= words_long_min
# 'Long' is assigned last, so a count equal to both bounds ends up as 'Long'
for class_name, members in (('Short', is_short),
                            ('Medium', is_medium),
                            ('Long', is_long)):
    df.loc[members, 'WordClass'] = class_name
df['WordClass'] = df['WordClass'].astype('category')
df.head()
# Bar plot of the word count per length class (barplot draws the mean)
fig = plt.figure()
sns.barplot(data=df, x="WordClass", y="Words",
            order=['Short', 'Medium', 'Long'], palette="Blues_d")
ax = plt.gca()
# pyplot is called directly instead of through the private sns.plt alias
plt.title('Distribution of Words', y = 1.04)
ax.set(ylabel = 'Words, mean', xlabel = '')
plt.tight_layout(pad=0.6, w_pad=0.8, h_pad=1.2)
plt.savefig('Plots/WordDistribution.svg', format='svg', dpi=300)
plt.close(fig)
display(SVG('Plots/WordDistribution.svg'))
# Words Per Minute levels per grade
gradeLevelTimes = (150, 250, 300, 350, 450, 575, 675, 800)
# For every reading speed add a '<speed>_WordsSecs' column holding the seconds
# needed to read each application at that speed
for number in gradeLevelTimes:
    seconds_column = '{}_WordsSecs'.format(number)
    minutes_needed = df['Words'] / number
    df[seconds_column] = minutes_needed * 60
df.head()
# Mean reading time per length class: one row per reading speed, one column
# per class
dfGradeTimes = pd.DataFrame(columns=('Short', 'Medium', 'Long'))
for number in gradeLevelTimes:
    seconds = df[str(number) + '_WordsSecs']
    # where() blanks out rows of other classes; mean() then skips those blanks
    dfGradeTimes.loc[number] = [
        seconds.where(df['WordClass'] == word_class).mean()
        for word_class in ('Short', 'Medium', 'Long')
    ]
dfGradeTimes
# One bar plot per length class: mean seconds to read at each reading speed,
# saved to Plots/WPM_<class>.svg and displayed.
sns.set_style("whitegrid", {'grid.linestyle': ''})
for y_variable in ('Short', 'Medium', 'Long'):
    svg_path = 'Plots/WPM_' + y_variable + '.svg'
    fig = plt.figure(figsize=(8, 6))
    sns.barplot(data=dfGradeTimes, x=list(dfGradeTimes.index.values),
                y=y_variable, palette="BrBG")
    ax = plt.gca()
    # pyplot is called directly instead of through the private sns.plt alias
    plt.title('Time to read, ' + y_variable + ' Applications', y = 1.04)
    ax.set(ylabel = 'Seconds, mean', xlabel = 'Words per Minute')
    plt.savefig(svg_path, format='svg', dpi=300)
    plt.close(fig)
    display(SVG(svg_path))
# Combined figure: reading time per speed for each length class (top row,
# shared y-axis) above the word count per class (bottom row).
sns.set_style("whitegrid")
fig = plt.figure(figsize=(8, 6))
nrows = 2
ncols = 3

# Bottom row: one wide panel spanning the full figure width
ax0 = plt.subplot(nrows, 1, 2)
sns.barplot(data=df, x="WordClass", y="Words",
            order=['Short', 'Medium', 'Long'], palette="Blues_d")
# pyplot is called directly instead of through the private sns.plt alias
plt.title('Distribution of Words', y = 1.04)
ax0.set(ylabel = 'Words, mean', xlabel = '')

# Top row: one panel per length class, each sharing y with the previous one
reading_axes = []
for position, y_variable in enumerate(('Short', 'Medium', 'Long'), start=1):
    share = {'sharey': reading_axes[-1]} if reading_axes else {}
    panel_ax = plt.subplot(nrows, ncols, position, **share)
    reading_axes.append(panel_ax)
    sns.barplot(data=dfGradeTimes, x=list(dfGradeTimes.index.values),
                y=y_variable, palette="BrBG")
    plt.ylabel('Seconds, mean' if position == 1 else '')
    plt.xlabel('Words /min, ' + y_variable)
    if position > 1:
        # The string 'off' is truthy and does not hide the labels on current
        # matplotlib; a real boolean does.
        panel_ax.tick_params(axis='y', which='both', labelleft=False)
        panel_ax.spines["left"].set_visible(False)
    if position < ncols:
        panel_ax.spines["right"].set_visible(False)
ax1, ax2, ax3 = reading_axes

fig.suptitle('Seconds to Read', fontsize = 16, fontweight = 'bold')
plt.tight_layout(pad=2.6, w_pad=0, h_pad=1.2)
plt.savefig('Plots/ReadingTime.svg', format='svg', dpi=300)
plt.close(fig)
display(SVG('Plots/ReadingTime.svg'))