cryptools/clean_input_csv.py at master · scoobybejesus/cryptools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python3

## Purpose:	Allows user to keep additional data in their CSV Input File to increase its usefulness and
##			enhance readability, yet be able to properly format it prior to importing into `cryptools`.
##			e.g.:
##				-Keep an additional first column for flagging/noting important transactions
##				-Keep additional columns for tracking a running balance
##				-Rows beneath transactions for life-to-date totals and other calculations and notes
##				-Ability to use number formatting with parenthesis for negative numbers and commas
##					-This script will change (1,000.00) to 1000.00
##
##			If a column doesn't have a header, this script will exclude it from the sanitized output.
##			Similarly, this script will exclude transaction rows missing data in either of the first
##			two fields of the row.

## Usage:

# 1.  Export/Save crypto activity as csv
# 2.  Move the csv file to your desired directory
# 3.  Rename file to <unedited>.csv (see variable below)
# 4.  Build/run this file in an editor or on command line (from same directory), creating the input file
# 5.  Import the input file into cryptools

import csv
import re
import os
import sys

unedited = "DigiTrnx.csv" 	# To be replaced with a launch arg, presumably

stage1 = "stage1.csv"

##		First, writes all header rows.  Then attempts to write all transaction rows.
##      In the transaction rows, if it finds blank/empty transaction date or proceeds fields,
##		it discards the row.

##		This allows notes/sums/calculations/etc under the transaction rows to be discarded

with open(unedited) as fin, open(stage1, 'a') as fout:
	rdr = csv.reader(fin)
	wtr = csv.writer(fout)
	header = next(rdr)
	header2 = next(rdr)
	header3 = next(rdr)
	header4 = next(rdr)
	wtr.writerow(header)
	wtr.writerow(header2)
	wtr.writerow(header3)
	wtr.writerow(header4)

	# First, double check there are no account number duplicates
	for i, val in enumerate(header):
		if val != "":
			if header.count(val) > 1:
				print("### There is a duplicate account number (" + val +").  Please fix and re-run. ###")
				sys.exit()

	for row in rdr:
		if row[0] == "" or row[1] == "":
			pass
		else:
			wtr.writerow(row)

stage2 = "stage2.csv"

##		Iterates over the fields in the first header row to search for empty/blank cells.
##      Keeps a list of every column index that does contain data, and disregards all the
##		indices for columns with a blank.

##		Using the indicies of valid columns, writes a new CSV file using only valid columns.

##		This is useful when the input file is also used to manually keep a running tally or
##		columns with additional notes, but which must be discarded to prepare a proper
##		CSV input file.

with open(stage1) as fin, open(stage2, 'a') as fout:
	rdr = csv.reader(fin)
	wtr = csv.writer(fout)
	header = next(rdr)
	header2 = next(rdr)
	header3 = next(rdr)
	header4 = next(rdr)

	colListKept = []

	for col in header:
		if col == "":
			pass
		else:
			colListKept.append(header.index(col))

	output = [v for (i,v) in enumerate(header) if i in colListKept]
	wtr.writerow(output)

	output = [v for (i,v) in enumerate(header2) if i in colListKept]
	wtr.writerow(output)

	output = [v for (i,v) in enumerate(header3) if i in colListKept]
	wtr.writerow(output)

	output = [v for (i,v) in enumerate(header4) if i in colListKept]
	wtr.writerow(output)

	for row in rdr:
		output = [v for (i,v) in enumerate(row) if i in colListKept]
		wtr.writerow(output)


stage3 = "InputFile-pycleaned.csv"

##		Performs final formatting changes to ensure values can be successfully parsed.
##	 	Numbers must have commas removed.  Negative numbers must have parentheses replaced
##		with a minus sign.  Could also be used to substitute the date separation character.

##	 	i.e., (1.01) -> -1.01  (1,000.00) -> -1000.00

with open(stage2) as fin, open(stage3, 'w') as fout:
	rdr = csv.reader(fin, quoting=csv.QUOTE_ALL)
	wtr = csv.writer(fout)

	header = next(rdr)
	header2 = next(rdr)
	header3 = next(rdr)
	header4 = next(rdr)
	wtr.writerow(header)
	wtr.writerow(header2)
	wtr.writerow(header3)
	wtr.writerow(header4)

	for row in rdr:
		listRow = []
		for field in row:
			fieldStr = str(field) # cast as string, just so there's no funny business
			try:
				# Handles negative numbers
				if fieldStr[0] == "(":
					fieldStr = fieldStr.replace('(','-').replace(')', '').replace(',', '')
					listRow.append(fieldStr)
					continue

			# Uncomment the below and modify as necessary if you want to change date formatting
				# elif re.search(r'\d\d-\d\d-\d\d',fieldStr):#	Find dates and change formatting
				# 	fieldStr = fieldStr.replace('-', '/')
				# 	listRow.append(fieldStr)
				# 	continue

				# Handle commas in remaining fields
				else:
					try:
						# if you remove commas from a string and are able to convert to float...
						fieldStr_test = fieldStr.replace(',', '')
						fieldStr_float = float(fieldStr_test)
						# then it is definitely a positive number, so remove the comma.
						fieldStr = fieldStr.replace(',', '')
						listRow.append(fieldStr)
						continue
					except: # If the 'try' block fails, it's a memo, not a number, so leave any commas
						listRow.append(fieldStr)
						continue
			except: # If the `try` block fails, it's a blank/empty string
				listRow.append(fieldStr)
				continue
		wtr.writerow(listRow)

os.remove(stage1)
os.remove(stage2)

print("Input file ready")