-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmax_student_scores_cli.py
More file actions
executable file
·125 lines (98 loc) · 3.93 KB
/
max_student_scores_cli.py
File metadata and controls
executable file
·125 lines (98 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#! /usr/bin/env python3
import argparse
import numpy as np
import os.path
import pandas as pd
import re
# Upload reference data
# Query with list of IDs
# Qualtrics master list
def check_ext(ext_choices):
"""
Checks if file exists and has an extension in ext_choices.
Adapted from: https://stackoverflow.com/questions/15203829
"""
class Act(argparse.Action):
def __call__(self,parser,namespace,fname,option_string=None):
if not os.path.exists(fname[0]):
parser.error('file {} does not exist'
.format(fname[0]) )
ext = os.path.splitext(fname[0])[1][1:].lower()
if ext not in ext_choices:
parser.error("file doesn't end with one of {}"
.format(', '.join(ext_choices)) )
else:
setattr(namespace,self.dest,fname[0])
return Act
def prep_input_args(args):
"""
Process the user input arguments. Default parsing should be
handled here.
"""
arg_dict = {
'score_csv' : args.score_csv,
'student_id_excel' : args.student_id_excel,
}
return arg_dict
def standardize_id_formats(x):
"""
Remove invalid ID numbers. May add more advanced cleaning later.
"""
try:
return np.int64(x)
except (ValueError, OverflowError):
return np.nan
def filter_columns(field_name):
columns_list = ['score', 'Enter your Student ID number:']
return (field_name.strip() in columns_list)
def prepare_data(score_csv,student_id_excel):
"""
Loading and cleaning up the data used for the analysis
"""
score_df = pd.read_csv(score_csv,
skiprows=1,
usecols=filter_columns)
score_df.rename(columns=lambda x: x.strip(), inplace=True)
# Can coerce to standard form here, pick up badly formatted columns
score_df['Enter your Student ID number:'] = \
score_df['Enter your Student ID number:'] \
.apply(standardize_id_formats)
score_df['score'] = pd.to_numeric(score_df['score'], errors='coerce')
score_df.dropna(axis=0, inplace=True)
student_id_df = pd.read_excel(student_id_excel)
student_id_df['Student ID'] = student_id_df['Student ID'].astype(np.int64)
return score_df, student_id_df
def write_max_scores(score_df,student_id_df):
# Get just the Student IDs we're interested in
sub_df = score_df[score_df['Enter your Student ID number:']
.isin(student_id_df['Student ID'])]
# Student IDs not in the Qualtrics input file
# are given a score of 0
not_taken = set(student_id_df['Student ID'].values) - \
set(sub_df['Enter your Student ID number:'].values)
sub_df = sub_df.append(pd.DataFrame({
'Enter your Student ID number:' : [ sid for sid in not_taken ],
'score' : [ 0 for sid in not_taken ],
}))
max_scores = sub_df.groupby('Enter your Student ID number:').max()
max_scores.to_excel('max_scores.xlsx')
def max_student_scores(arg_dict):
score_df, student_id_df = prepare_data(
arg_dict['score_csv'],
arg_dict['student_id_excel'],
)
write_max_scores(score_df,student_id_df)
if __name__ == '__main__':
parser=argparse.ArgumentParser(description=('Takes a file containing a'
' list of product ID\'s and the name of the website they were'
' taken from as argument. The program then scrapes all'
' product reviews and additional (site-dependent) information.'))
parser.add_argument('score_csv', nargs=1, action=check_ext(['csv']),
help=('Path to CSV file containing scores'))
parser.add_argument('student_id_excel', nargs=1,
action=check_ext(['xlsx','xlsm']),
help=('Path to Excel file containing the desired'
' student ID numbers.'))
args = parser.parse_args()
arg_dict = prep_input_args(args)
max_student_scores(arg_dict)