score_selection_interface/max_student_scores_cli.py at master · magee256/score_selection_interface · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#! /usr/bin/env python3

import argparse
import numpy as np
import os.path
import pandas as pd
import re


# Upload reference data
# Query with list of IDs
# Qualtrics master list

def check_ext(ext_choices):
    """
    Checks if file exists and has an extension in ext_choices.

    Adapted from: https://stackoverflow.com/questions/15203829
    """
    class Act(argparse.Action):
        def __call__(self,parser,namespace,fname,option_string=None):
            if not os.path.exists(fname[0]):
                parser.error('file {} does not exist'
                        .format(fname[0]) )

            ext = os.path.splitext(fname[0])[1][1:].lower()
            if ext not in ext_choices:
                parser.error("file doesn't end with one of {}"
			.format(', '.join(ext_choices)) )
            else:
                setattr(namespace,self.dest,fname[0])

    return Act


def prep_input_args(args):
    """
    Process the user input arguments. Default parsing should be
    handled here.
    """
    arg_dict = {
            'score_csv' : args.score_csv,
            'student_id_excel' : args.student_id_excel,
            }
    return arg_dict


def standardize_id_formats(x):
    """
    Remove invalid ID numbers. May add more advanced cleaning later.
    """
    try:
        return np.int64(x)
    except (ValueError, OverflowError):
        return np.nan


def filter_columns(field_name):
    columns_list = ['score', 'Enter your Student ID number:']
    return (field_name.strip() in columns_list)


def prepare_data(score_csv,student_id_excel):
    """
    Loading and cleaning up the data used for the analysis
    """
    score_df = pd.read_csv(score_csv,
            skiprows=1,
            usecols=filter_columns)
    score_df.rename(columns=lambda x: x.strip(), inplace=True)

    # Can coerce to standard form here, pick up badly formatted columns
    score_df['Enter your Student ID number:'] = \
            score_df['Enter your Student ID number:'] \
                            .apply(standardize_id_formats)
    score_df['score'] = pd.to_numeric(score_df['score'], errors='coerce')
    score_df.dropna(axis=0, inplace=True)

    student_id_df = pd.read_excel(student_id_excel)
    student_id_df['Student ID'] = student_id_df['Student ID'].astype(np.int64)
    return score_df, student_id_df


def write_max_scores(score_df,student_id_df):
    # Get just the Student IDs we're interested in
    sub_df = score_df[score_df['Enter your Student ID number:']
            .isin(student_id_df['Student ID'])]

    # Student IDs not in the Qualtrics input file
    # are given a score of 0
    not_taken = set(student_id_df['Student ID'].values) - \
            set(sub_df['Enter your Student ID number:'].values)
    sub_df = sub_df.append(pd.DataFrame({
        'Enter your Student ID number:' : [ sid for sid in not_taken ],
        'score' : [ 0 for sid in not_taken ],
        }))

    max_scores = sub_df.groupby('Enter your Student ID number:').max()
    max_scores.to_excel('max_scores.xlsx')


def max_student_scores(arg_dict):
    score_df, student_id_df = prepare_data(
            arg_dict['score_csv'],
            arg_dict['student_id_excel'],
            )
    write_max_scores(score_df,student_id_df)


if __name__ == '__main__':
    parser=argparse.ArgumentParser(description=('Takes a file containing a'
            ' list of product ID\'s and the name of the website they were'
            ' taken from as argument. The program then scrapes all'
            ' product reviews and additional (site-dependent) information.'))

    parser.add_argument('score_csv', nargs=1, action=check_ext(['csv']),
            help=('Path to CSV file containing scores'))
    parser.add_argument('student_id_excel', nargs=1,
            action=check_ext(['xlsx','xlsm']),
            help=('Path to Excel file containing the desired'
                ' student ID numbers.'))

    args = parser.parse_args()
    arg_dict = prep_input_args(args)
    max_student_scores(arg_dict)