# General packages: Operating System Interface, System-specific parameters and functions,
# Regular expression operations, Unix style pathname pattern expansion
import os, platform, sys, re, json, glob
# Statistics from text functions, Date-parsing and manipulation
from textstat.textstat import textstat
import time
import datetime
import dateutil.parser
import calendar
# Function for determining if string is a date
def is_date(string):
    """
    Report whether `string` can be parsed as a date by `dateutil`.

    Returns True when `dateutil.parser.parse` accepts the string and False
    when it rejects it. Nothing is returned from the parse itself; callers
    that need the value parse the string again.
    """
    try:
        dateutil.parser.parse(string)
    except (ValueError, OverflowError):
        # ValueError: the text is not recognisable as a date.
        # OverflowError: dateutil documents this for numeric fields that are
        # too large; such a line is simply "not a date" for our purposes and
        # must not abort the whole run.
        return False
    return True
# Function for converting a date-string to a timestamp
def returnts(datestring):
    """
    Parse `datestring` with `dateutil` and return it as whole seconds since
    the Unix epoch.

    A string without timezone information is interpreted as UTC. A string
    carrying a UTC offset is converted to UTC first, because
    `calendar.timegm` expects a UTC time tuple.
    Raises whatever `dateutil.parser.parse` raises for unparseable input.
    """
    parsed = dateutil.parser.parse(datestring)
    # utctimetuple() equals timetuple() for naive datetimes and applies the
    # offset for aware ones.
    return calendar.timegm(parsed.utctimetuple())
# Function for finding file creation date
def creation_date(path_to_file):
    """
    Return the time `path_to_file` was created, or the time it was last
    modified when the platform does not report a creation time.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.
    """
    if platform.system() != 'Windows':
        file_info = os.stat(path_to_file)
        if hasattr(file_info, 'st_birthtime'):
            return file_info.st_birthtime
        # Probably Linux: no easy way to get a creation date, so settle for
        # the time the content was last modified.
        return file_info.st_mtime
    return os.path.getctime(path_to_file)
# Function for replacing word with word from a dictionary
def multiwordReplace(text, wordDic):
    """
    take a text and replace every occurrence of a key in a dictionary with
    the associated value, return the changed text

    Keys are matched literally (regex metacharacters are escaped) and
    case-sensitively, anywhere in the text. When one key is a prefix of
    another, the longer key wins. An empty dictionary returns the text
    unchanged.
    """
    if not wordDic:
        # An empty alternation would match the empty string at every
        # position and then fail the lookup with KeyError('').
        return text
    # Regex alternation takes the first alternative that matches, so offer
    # the longest keys first to stop a short key shadowing a longer one.
    keys = sorted(wordDic, key=len, reverse=True)
    rc = re.compile('|'.join(map(re.escape, keys)))
    def translate(match):
        return wordDic[match.group(0)]
    return rc.sub(translate, text)
# Lookup from lowercase Norwegian month-names to their English equivalents
monthDict = dict(
    januar='january',
    februar='february',
    mars='march',
    april='april',
    mai='may',
    juni='june',
    juli='july',
    august='august',
    september='september',
    oktober='october',
    november='november',
    desember='december',
)
# Based on DocToTXT.vba, which produces clean .txt-versions of .doc-files in ANSI.
# *NOTE*: This operation requires significant processing power and time.
# Container for the per-Application records, keyed by running index
data = dict()
# Phrases that commonly open an Application
intro_strings = (
    'Application to',
    'Søknad til',
)
# Phrases that commonly close an Application
outro_strings = (
    'Best regards',
    'Med vennlig hilsen',
)
# Place-names that precede dates in Applications, each mapped to its replacement
locations = dict.fromkeys(('Bergen', 'Oslo', 'Tromsø'), '')
# Titles of Applications known to have yielded an interview
interviews = (
    'IMDi Vest',
    'Manpower',
    'NHH',
    'Nord Universitet',
    'PVS',
    'Ramsalt',
    'SampolSosiologi',
)
# Titles of Applications known to have yielded an offer
offers = (
    'Eksamensvakt, UiB',
    'IMDi Vest',
    'Manpower',
    'NHH',
    'PVS',
    'Sixt Biluteleie AS',
)
# Titles of Applications known to have yielded a reply
replies = (
    'AdmOrg',
    'Null',
)
# Loop over converted .txt-files, counting iteratively.
# Each file becomes one record in `data`, keyed by `index`, holding its title,
# date, extracted content, known results, and text statistics.
index = 0
for f in glob.glob('./Data/DocConverted/*.txt'):
    # Assume `latin-1` encoding, commensurate with MS Word saving as ANSI.
    # The context manager closes the file even if reading fails.
    with open(f, 'r', encoding='latin-1') as inputfile:
        # Read file by lines
        lines = inputfile.readlines()
    # Clean up title: keep only the file name (works with both `/` and `\`
    # separators) and drop the author suffix together with the extension.
    title = os.path.basename(f).replace(' - Ole Vik.txt', '')
    data[index] = {}
    data[index]['Title'] = title
    data[index]['Date'] = None
    # Reset the Application boundaries for every file. Without this a file
    # lacking an introduction or ending would reuse the previous file's
    # line numbers (or raise NameError on the first file). The defaults
    # select the whole file.
    intro = 0
    outro = len(lines)
    # Iterate over lines; `num` is 1-based
    for num, line in enumerate(lines, 1):
        if any(string in line for string in locations):
            date = line
            for k, v in locations.items():
                date = date.replace(k + ' ', v)
            # Convert month-names for date-recognition
            date = multiwordReplace(date, monthDict)
            if is_date(date):
                # The last parseable date line in the file wins
                timestamp = returnts(date)
                data[index]['Date'] = timestamp
        # Search for start of Application
        if any(string in line for string in intro_strings):
            intro = num
        # Search for end of Application
        if any(string in line for string in outro_strings):
            outro = num
    # Compile Application from start to end, assign as `Content`.
    # Because `num` is 1-based, the slice begins on the line after the
    # introduction and includes the ending line itself.
    obscured_lines = lines[intro:outro]
    data[index]['Content'] = obscured_lines
    joined_content = ' '.join(obscured_lines)
    # Check for and assign Interview, Offer, Reply
    data[index]['Results'] = {
        'Interview': title in interviews,
        'Offer': title in offers,
        'Reply': title in replies,
    }
    # Assign descriptive statistics. The word count is taken from the joined
    # text, like the other statistics, rather than from the list of lines.
    data[index]['Descriptive'] = {}
    data[index]['Descriptive']['Words'] = textstat.lexicon_count(joined_content)
    data[index]['Descriptive']['Sentences'] = textstat.sentence_count(joined_content)
    data[index]['Descriptive']['Lines'] = len(obscured_lines)
    #data[index]['Descriptive']['Paragraphs'] = None
    # Create analytical readability-statistics
    analytical = {
        'flesch_reading_ease': textstat.flesch_reading_ease(joined_content),
        'smog_index': textstat.smog_index(joined_content),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(joined_content),
        'coleman_liau_index': textstat.coleman_liau_index(joined_content),
        'automated_readability_index': textstat.automated_readability_index(joined_content),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(joined_content),
        'difficult_words': textstat.difficult_words(joined_content),
        'linsear_write_formula': textstat.linsear_write_formula(joined_content),
        'gunning_fog': textstat.gunning_fog(joined_content),
        'text_standard': textstat.text_standard(joined_content)
    }
    # Assign analytical statistics
    data[index]['Analytical'] = {}
    data[index]['Analytical']['Readability'] = analytical
    #data[index]['Analytical']['Clarity'] = {}
    #data[index]['Analytical']['Applicability'] = {}
    # Locate the original .doc-file: swap the directory once and replace only
    # the final extension, so a `.txt` elsewhere in the name is left alone.
    original_file = os.path.splitext(
        f.replace('./Data/DocConverted', './Data/Doc', 1)
    )[0] + '.doc'
    data[index]['creation_date'] = creation_date(original_file)
    data[index]['modification_date'] = os.path.getmtime(original_file)
    index += 1
# Save `data` to JSON-file.
# The success message is printed only when the write completed; a failure to
# open or write the file is reported by printing the error instead.
try:
    with open('Data/Applications.json', 'w') as json_file:
        # ensure_ascii=False writes non-ASCII characters as-is
        json.dump(data, json_file, ensure_ascii=False)
except IOError as e:
    print (e)
else:
    print ('Saved Data/Applications.json with ' + str(len(data)) + ' items.')