-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_input_csv.py
More file actions
executable file
·168 lines (134 loc) · 5.3 KB
/
clean_input_csv.py
File metadata and controls
executable file
·168 lines (134 loc) · 5.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python3
## Purpose: Allows user to keep additional data in their CSV Input File to increase its usefulness and
## enhance readability, yet be able to properly format it prior to importing into `cryptools`.
## e.g.:
## -Keep an additional first column for flagging/noting important transactions
## -Keep additional columns for tracking a running balance
## -Rows beneath transactions for life-to-date totals and other calculations and notes
## -Ability to use number formatting with parenthesis for negative numbers and commas
## -This script will change (1,000.00) to 1000.00
##
## If a column doesn't have a header, this script will exclude it from the sanitized output.
## Similarly, this script will exclude transaction rows missing data in either of the first
## two fields of the row.
## Usage:
# 1. Export/Save crypto activity as csv
# 2. Move the csv file to your desired directory
# 3. Rename file to <unedited>.csv (see variable below)
# 4. Build/run this file in an editor or on command line (from same directory), creating the input file
# 5. Import the input file into cryptools
import csv
import re
import os
import sys
unedited = "DigiTrnx.csv" # To be replaced with a launch arg, presumably
stage1 = "stage1.csv"
## First, writes all header rows. Then attempts to write all transaction rows.
## In the transaction rows, if it finds blank/empty transaction date or proceeds fields,
## it discards the row.
## This allows notes/sums/calculations/etc under the transaction rows to be discarded
with open(unedited) as fin, open(stage1, 'a') as fout:
rdr = csv.reader(fin)
wtr = csv.writer(fout)
header = next(rdr)
header2 = next(rdr)
header3 = next(rdr)
header4 = next(rdr)
wtr.writerow(header)
wtr.writerow(header2)
wtr.writerow(header3)
wtr.writerow(header4)
# First, double check there are no account number duplicates
for i, val in enumerate(header):
if val != "":
if header.count(val) > 1:
print("### There is a duplicate account number (" + val +"). Please fix and re-run. ###")
sys.exit()
for row in rdr:
if row[0] == "" or row[1] == "":
pass
else:
wtr.writerow(row)
stage2 = "stage2.csv"
## Iterates over the fields in the first header row to search for empty/blank cells.
## Keeps a list of every column index that does contain data, and disregards all the
## indices for columns with a blank.
## Using the indicies of valid columns, writes a new CSV file using only valid columns.
## This is useful when the input file is also used to manually keep a running tally or
## columns with additional notes, but which must be discarded to prepare a proper
## CSV input file.
with open(stage1) as fin, open(stage2, 'a') as fout:
rdr = csv.reader(fin)
wtr = csv.writer(fout)
header = next(rdr)
header2 = next(rdr)
header3 = next(rdr)
header4 = next(rdr)
colListKept = []
for col in header:
if col == "":
pass
else:
colListKept.append(header.index(col))
output = [v for (i,v) in enumerate(header) if i in colListKept]
wtr.writerow(output)
output = [v for (i,v) in enumerate(header2) if i in colListKept]
wtr.writerow(output)
output = [v for (i,v) in enumerate(header3) if i in colListKept]
wtr.writerow(output)
output = [v for (i,v) in enumerate(header4) if i in colListKept]
wtr.writerow(output)
for row in rdr:
output = [v for (i,v) in enumerate(row) if i in colListKept]
wtr.writerow(output)
stage3 = "InputFile-pycleaned.csv"
## Performs final formatting changes to ensure values can be successfully parsed.
## Numbers must have commas removed. Negative numbers must have parentheses replaced
## with a minus sign. Could also be used to substitute the date separation character.
## i.e., (1.01) -> -1.01 (1,000.00) -> -1000.00
with open(stage2) as fin, open(stage3, 'w') as fout:
rdr = csv.reader(fin, quoting=csv.QUOTE_ALL)
wtr = csv.writer(fout)
header = next(rdr)
header2 = next(rdr)
header3 = next(rdr)
header4 = next(rdr)
wtr.writerow(header)
wtr.writerow(header2)
wtr.writerow(header3)
wtr.writerow(header4)
for row in rdr:
listRow = []
for field in row:
fieldStr = str(field) # cast as string, just so there's no funny business
try:
# Handles negative numbers
if fieldStr[0] == "(":
fieldStr = fieldStr.replace('(','-').replace(')', '').replace(',', '')
listRow.append(fieldStr)
continue
# Uncomment the below and modify as necessary if you want to change date formatting
# elif re.search(r'\d\d-\d\d-\d\d',fieldStr):# Find dates and change formatting
# fieldStr = fieldStr.replace('-', '/')
# listRow.append(fieldStr)
# continue
# Handle commas in remaining fields
else:
try:
# if you remove commas from a string and are able to convert to float...
fieldStr_test = fieldStr.replace(',', '')
fieldStr_float = float(fieldStr_test)
# then it is definitely a positive number, so remove the comma.
fieldStr = fieldStr.replace(',', '')
listRow.append(fieldStr)
continue
except: # If the 'try' block fails, it's a memo, not a number, so leave any commas
listRow.append(fieldStr)
continue
except: # If the `try` block fails, it's a blank/empty string
listRow.append(fieldStr)
continue
wtr.writerow(listRow)
os.remove(stage1)
os.remove(stage2)
print("Input file ready")