Build JSON

In [1]:
# General packages: operating-system interface, platform identification, system-specific
# parameters and functions, regular expressions, JSON encoding/decoding, Unix-style pathname expansion
import os, platform, sys, re, json, glob

# Text-statistics functions; date parsing and manipulation
from textstat.textstat import textstat
import time
import datetime
import dateutil.parser
import calendar

# Function for determining if string is a date
def is_date(string):
    """
    Report whether `string` can be parsed as a date by `dateutil`.

    Returns True when `dateutil.parser.parse` accepts the string and False
    when it rejects it.
    """
    try:
        dateutil.parser.parse(string)
    except (ValueError, OverflowError):
        # ValueError: the text is not a recognisable date.
        # OverflowError: dateutil documents this for parsed values too large
        # to represent; for this predicate that also means "not a date".
        return False
    return True

# Function for converting a date-string to a Unix timestamp
def returnts(datestring):
    """
    Convert a date string to a Unix timestamp (whole seconds since the epoch).

    The string is parsed with `dateutil.parser.parse`; any exception raised by
    the parser (e.g. ValueError for unparseable text) propagates to the caller.
    A date without a UTC offset is interpreted as UTC.
    """
    parsed = dateutil.parser.parse(datestring)
    # utctimetuple() normalises an offset-aware date to UTC before conversion
    # and is identical to timetuple() for naive dates, so plain dates give the
    # same result as before while dates carrying an offset are no longer
    # silently treated as if they were UTC.
    return calendar.timegm(parsed.utctimetuple())

# Function for finding file creation date
def creation_date(path_to_file):
    """
    Return the creation time of a file as a timestamp, or its last
    modification time where the platform does not expose a creation time.
    Approach taken from http://stackoverflow.com/a/39501288/1709587.
    """
    # Windows: os.path.getctime is used as the creation time
    if platform.system() == 'Windows':
        return os.path.getctime(path_to_file)

    file_info = os.stat(path_to_file)
    # The stat result only carries `st_birthtime` on some platforms; where it
    # is missing (probably Linux) settle for when the content was last modified.
    return getattr(file_info, 'st_birthtime', file_info.st_mtime)

# Function for replacing word with word from a dictionary
def multiwordReplace(text, wordDic):
    """
    Replace every occurrence of a key of `wordDic` in `text` with the
    associated value and return the changed text.

    Matching is literal (keys are regex-escaped), case-sensitive and not
    restricted to whole words: a key is also replaced inside a longer word.
    When several keys match at the same position the longest one wins.
    `text` is returned unchanged when `wordDic` is empty.
    """
    if not wordDic:
        # Joining no keys yields the empty pattern, which matches at every
        # position and would then fail the dictionary lookup in `translate`.
        return text
    # Regex alternation takes the first alternative that matches, so try the
    # longest keys first; otherwise a key that is a prefix of another key
    # would always shadow the longer one.
    keys = sorted(wordDic, key=len, reverse=True)
    rc = re.compile('|'.join(map(re.escape, keys)))
    def translate(match):
        return wordDic[match.group(0)]
    return rc.sub(translate, text)

# Norwegian to English month-dictionary
# Lower-case Norwegian month names mapped to their English equivalents,
# in calendar order
monthDict = dict(
    januar='january',
    februar='february',
    mars='march',
    april='april',
    mai='may',
    juni='june',
    juli='july',
    august='august',
    september='september',
    oktober='october',
    november='november',
    desember='december',
)

Data Operations

Based on DocToTXT.vba, which produces clean .txt-versions of .doc-files in ANSI.

In [2]:
# *NOTE*: This operation requires significant processing power and time.

# Collected Applications, keyed by running index
data = {}

# Phrases that mark the start of an Application
intro_strings = (
    'Application to',
    'Søknad til',
)
# Phrases that mark the end of an Application
outro_strings = (
    'Best regards',
    'Med vennlig hilsen',
)
# Place names that precede the date in an Application; each is mapped to the
# (empty) text it is replaced with before the date is parsed
locations = dict.fromkeys(('Bergen', 'Oslo', 'Tromsø'), '')

# Titles of Applications known to have yielded an interview
interviews = (
    'IMDi Vest',
    'Manpower',
    'NHH',
    'Nord Universitet',
    'PVS',
    'Ramsalt',
    'SampolSosiologi',
)
# Titles of Applications known to have yielded an offer
offers = (
    'Eksamensvakt, UiB',
    'IMDi Vest',
    'Manpower',
    'NHH',
    'PVS',
    'Sixt Biluteleie AS',
)
# Titles of Applications known to have yielded a reply
replies = (
    'AdmOrg',
    'Null',
)

# Loop over converted .txt-files, counting iteratively.
# The file list is sorted so that the index given to each Application is
# reproducible; glob itself returns files in arbitrary order.
index = 0
for f in sorted(glob.glob('./Data/DocConverted/*.txt')):
    # Assume `latin-1` encoding, commensurate with MS Word saving as ANSI.
    # The context manager closes the file even if reading fails.
    with open(f, 'r', encoding='latin-1') as inputfile:
        lines = inputfile.readlines()

    # Clean up title: drop the directory part (whichever path separator glob
    # produced) and the author-suffix, leaving the bare name that is compared
    # against `interviews`, `offers` and `replies` below
    filename = os.path.basename(f)
    title = filename.replace(' - Ole Vik.txt', '')

    entry = {}
    data[index] = entry
    entry['Title'] = title
    entry['Date'] = None

    # Reset the Application span for every file. `None` bounds make the slice
    # below fall back to the start/end of the document when a marker is
    # missing, instead of reusing the previous file's line numbers (or raising
    # NameError on the first file).
    intro = None
    outro = None

    # Iterate over lines, numbered from 1
    for num, line in enumerate(lines, 1):
        if any(string in line for string in locations):
            date = line
            # Strip the place name so only the date itself remains
            for k, v in locations.items():
                date = date.replace(k + ' ', v)
            # Convert month-names for date-recognition
            date = multiwordReplace(date, monthDict)
            if is_date(date):
                # A later matching line overwrites an earlier one
                entry['Date'] = returnts(date)
        # Search for start of Application
        if any(string in line for string in intro_strings):
            intro = num
        # Search for end of Application
        if any(string in line for string in outro_strings):
            outro = num

    # Compile Application from start to end, assign as `Content`.
    # `num` is 1-based, so the slice starts on the line after the introduction
    # and includes the line holding the ending phrase.
    obscured_lines = lines[intro:outro]
    entry['Content'] = obscured_lines

    joined_content = ' '.join(obscured_lines)

    # Check for and assign Interview, Offer, Reply
    entry['Results'] = {
        'Interview': title in interviews,
        'Offer': title in offers,
        'Reply': title in replies,
    }

    # Assign descriptive statistics.
    # The word count is taken from the joined text, like every other textstat
    # call here; the function is given text rather than the list of lines.
    entry['Descriptive'] = {
        'Words': textstat.lexicon_count(joined_content),
        'Sentences': textstat.sentence_count(joined_content),
        'Lines': len(obscured_lines),
    }
    # TODO: entry['Descriptive']['Paragraphs']

    # Create analytical readability-statistics
    analytical = {
        'flesch_reading_ease': textstat.flesch_reading_ease(joined_content),
        'smog_index': textstat.smog_index(joined_content),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(joined_content),
        'coleman_liau_index': textstat.coleman_liau_index(joined_content),
        'automated_readability_index': textstat.automated_readability_index(joined_content),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(joined_content),
        'difficult_words': textstat.difficult_words(joined_content),
        'linsear_write_formula': textstat.linsear_write_formula(joined_content),
        'gunning_fog': textstat.gunning_fog(joined_content),
        'text_standard': textstat.text_standard(joined_content)
    }

    # Assign analytical statistics
    entry['Analytical'] = {'Readability': analytical}
    # TODO: entry['Analytical']['Clarity'], entry['Analytical']['Applicability']

    # Locate the original .doc next to the converted file: same base name in
    # ./Data/Doc, with only the final extension swapped
    original_file = os.path.join('./Data/Doc', os.path.splitext(filename)[0] + '.doc')
    entry['creation_date'] = creation_date(original_file)
    entry['modification_date'] = os.path.getmtime(original_file)

    index += 1
In [3]:
# Save `data` to JSON-file
try:
    # Non-ASCII characters are written as-is (ensure_ascii=False), so fix the
    # file encoding to UTF-8 instead of relying on the platform default.
    with open('Data/Applications.json', 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False)
except IOError as e:
    # Report the failure; nothing was saved
    print(e)
else:
    # Only announce success when the file was actually written
    print('Saved Data/Applications.json with ' + str(len(data)) + ' items.')
Saved Data/Applications.json with 107 items.