Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ venv
.idea
__pycache*
DKEFSFAKEDATA.csv
build/
dist/
*egg*
3 changes: 3 additions & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import fix_csv.fix_csv
import test_fixcsv.test_fix_csv
import test_fixcsv.data.getdata
4 changes: 4 additions & 0 deletions fix_csv/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .fix_csv import main

# Start the command-line interface only when this module is the program
# entry point, not when it is imported.
if __name__ == "__main__":
    main()
102 changes: 68 additions & 34 deletions fix_csv.py → fix_csv/fix_csv.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,74 @@
import sys
"""Fix csv module"""
import argparse
from typing import List
import pandas as pd
from difflib import SequenceMatcher
import pandas as pd


class FixCSV:
""" Class to help fixing csv columns with the closest valid input"""

def __init__(
self, delimiter: str = ",", autofix: bool = False, quiet: bool = False
self,
delimiter: str = ",",
auto_fix: bool = True,
quiet: bool = False,
csv_path: str = None,
):
"""initialize class"""
self.possible_values = []
self.delimiter = delimiter
self.autofix = autofix
self.auto_fix = auto_fix
self.quiet = quiet
self.fixed_csv = None
self.csv_path = csv_path

def set_possible_values(self, possible_values: List[str]):
"""set all possible values that the column could be"""
self.possible_values = possible_values

def add_possible_value(self, possible_value: str):
"""add a possible value to list"""
self.possible_values.append(possible_value)

def get_possible_values(self) -> List[str]:
"""return all possible values"""
return self.possible_values

def fix_csv_column(self, csv_path: str, column: int) -> List[str]:
csv = pd.read_csv(csv_path, delimiter=self.delimiter, header=None)
csv_column = list(csv.iloc[:, column])
fixed_column = [self.validate_string(val) for val in csv_column]
def fix_csv_column(self, column: int, csv_path: str = None) -> List[str]:
"""read in csv and fix column"""
if csv_path:
csv = pd.read_csv(csv_path, delimiter=self.delimiter, header=None)
else:
print(self.csv_path)
csv = pd.read_csv(self.csv_path, delimiter=self.delimiter, header=None)

fixed_column = [self.edit_string(val) for val in list(csv.iloc[:, column])]
csv.iloc[:, column] = pd.Series(fixed_column)
self.fixed_csv = csv
return fixed_column

def validate_string(self, string_to_validate: str):
def edit_string(self, string_to_validate: str) -> str:
"""get ratios and fix string"""
if string_to_validate in self.possible_values:
return string_to_validate
ratios = self.get_ratios(string_to_validate)
if self.auto_fix:
fixed_val = self.autofix_string(ratios, string_to_validate)
else:
fixed_val = self.edit_string(string_to_validate)
fixed_val = self.manual_fix_string(ratios, string_to_validate)
return fixed_val

def edit_string(self, string_to_validate):
ratios = [
def get_ratios(self, string_to_validate: str) -> List[float]:
"""for each possible valid value, return the distance from the input string"""
return [
SequenceMatcher(None, val, string_to_validate).ratio()
for val in self.possible_values
]
if self.autofix:
fixed_val = self.autofix_string(ratios, string_to_validate)
else:
fixed_val = self.manual_fix_string(ratios, string_to_validate)
return fixed_val

def manual_fix_string(self, ratios, string_to_validate: str):
# via https://stackoverflow.com/questions/13070461/get-index-of-the-top-n-values-of-a-list-in-python
"""manually fix the string with bash inputs"""
# https://stackoverflow.com/questions/13070461/get-index-of-the-top-n-values-of-a-list-in-python
print("\n")
sorted_ratios = sorted(
range(len(ratios)), key=lambda i: ratios[i], reverse=True
Expand All @@ -71,7 +89,7 @@ def manual_fix_string(self, ratios, string_to_validate: str):
val = input("Top 3 most similar values: select an option: ")
try:
selection = int(val)
if selection >= 1 and selection <= 5:
if 0 < selection < 6:
valid_input = True
if selection == 5:
selected_value = string_to_validate
Expand All @@ -87,7 +105,7 @@ def manual_fix_string(self, ratios, string_to_validate: str):

else:
print("You have entered a number that is not an option")
except:
except TypeError:
print("You have entered an invalid response")

return selected_value
Expand All @@ -101,19 +119,35 @@ def autofix_string(self, ratios, string_to_validate):
print(string_to_validate, "->", fixed_val)
return fixed_val

def read_possible_values(self, possible_values: str):
with open(possible_values, "r") as file:
val = [line.strip() for line in file]
self.set_possible_values(val)


def main(argv=None):
    """Command-line entry point: fix one column of a csv file.

    ``argv`` is the list of command-line arguments to parse; when it is
    ``None`` argparse falls back to ``sys.argv[1:]``, so calling ``main()``
    with no arguments behaves like a normal script invocation.

    Positional arguments are the csv path and the zero-based column index.
    ``--possible_values`` lists the accepted values, ``--output`` names the
    file the fixed csv is written to (without it every row is printed),
    ``--no-auto-fix`` switches to interactive fixing and ``--quiet`` is
    forwarded to ``FixCSV``. Invalid arguments end the program through
    argparse's usual usage error.
    """
    # The text must be passed as ``description``; the first positional
    # parameter of ArgumentParser is ``prog`` (the program name).
    parser = argparse.ArgumentParser(
        description="automatically fixes csv columns with acceptable values"
    )
    parser.add_argument("csv_path", nargs=1)
    # type=int lets argparse report a non-numeric column as a usage error.
    parser.add_argument("column", nargs=1, type=int)
    parser.add_argument("--possible_values", nargs="*")
    parser.add_argument("--output", nargs="?")
    parser.add_argument("--no-auto-fix", action="store_true")
    parser.add_argument("--quiet", action="store_true")
    args = parser.parse_args(argv)

    # Without any accepted value there is nothing to fix towards; argparse
    # leaves the option as None when it is omitted.
    if not args.possible_values:
        parser.error("at least one value must be given with --possible_values")

    fcsv = FixCSV(
        csv_path=args.csv_path[0],
        auto_fix=not args.no_auto_fix,
        quiet=args.quiet,
    )
    fcsv.set_possible_values(args.possible_values)
    fcsv.fix_csv_column(column=args.column[0])

    if args.output is not None:
        fcsv.fixed_csv.to_csv(args.output, index=False, header=False)
    else:
        for _, row in fcsv.fixed_csv.iterrows():
            print(row)


if __name__ == "__main__":
if len(sys.argv) < 4:
print("Not enough arguments")
exit(1)

csv_path = sys.argv[1]
column = int(sys.argv[2])
path_to_possible_values = sys.argv[3]
with open(path_to_possible_values, "r") as f:
val = [line.strip() for line in f]
fcsv = FixCSV()
fcsv.set_possible_values(val)
fixed_column = fcsv.fix_csv_column(csv_path, column)
if fcsv.fixed_csv is not None:
fcsv.fixed_csv.to_csv(csv_path.replace(".", "_fix."), index=None, header=None)
main()
8 changes: 8 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from setuptools import setup

# Package metadata for the fix_csv command-line tool.
setup(
    name="FixCsv",
    version="0.0.1",
    packages=["fix_csv"],
    # fix_csv/fix_csv.py imports pandas, so it has to be installed with the package.
    install_requires=["pandas"],
    # Installs a ``fix_csv`` console command that calls ``main``.
    entry_points={"console_scripts": ["fix_csv = fix_csv.__main__:main"]},
)
11 changes: 11 additions & 0 deletions test_fixcsv/data/demo_header.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
clientid,weekdays,up
0,Wed,False
1,Thu,True
2,Fri,False
3,Sat,False
4,Sun,True
5,Mon,False
6,Tue,True
7,Wed,True
8,Thu,False
9,Fri,False
10 changes: 10 additions & 0 deletions test_fixcsv/data/demo_noheader.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,Wed,False
1,Thu,True
2,Fri,False
3,Sat,False
4,Sun,True
5,Mon,False
6,Tue,True
7,Wed,True
8,Thu,False
9,Fri,False
11 changes: 11 additions & 0 deletions test_fixcsv/data/demodata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
clientid,date,weekdays,gains,prices,up
0,2008-04-30,Wed,-0.52458192906686452,7791404.0091921333,False
1,2008-05-01,Thu,0.076191536201738269,3167180.7366340165,True
2,2008-05-02,Fri,-0.86850970062880861,9589766.9613829032,False
3,2008-05-03,Sat,-0.42701083852713395,8949415.1867596991,False
4,2008-05-04,Sun,0.2532553652693274,937163.44375252665,True
5,2008-05-05,Mon,-0.68151636911081892,949579.88022264629,False
6,2008-05-06,Tue,0.0071911579626532168,7268426.906552773,True
7,2008-05-07,Wed,0.67449747200412147,7517014.782897247,True
8,2008-05-08,Thu,-1.1841008656818983,1920959.5423492221,False
9,2008-05-09,Fri,-1.5803692595811152,8456240.6198725495,False
7 changes: 7 additions & 0 deletions test_fixcsv/data/getdata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import os

# Absolute path of the directory that contains this module.
FILEPATH = os.path.dirname(os.path.abspath(__file__))


def get_test_data():
    """Return the path of ``testdata.csv`` inside this module's directory."""
    filename = "testdata.csv"
    return os.path.join(FILEPATH, filename)
9 changes: 9 additions & 0 deletions test_fixcsv/data/testdata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
test
test
test
test
tset
terst
test
test
test
4 changes: 2 additions & 2 deletions test/manual_test.py → test_fixcsv/manual_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
import os

sys.path.append(os.path.abspath(sys.path[0]) + "/../")
from fix_csv import FixCSV
from fix_csv.fix_csv import FixCSV


def main():
fcsv = FixCSV(autofix=False)
fcsv = FixCSV(auto_fix=False)
possible_values = [
"ACGA",
"ACGA_FU2",
Expand Down
13 changes: 13 additions & 0 deletions test_fixcsv/test_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import unittest
from test_fixcsv.data.getdata import get_test_data
from fix_csv.fix_csv import FixCSV


class TestCSV(unittest.TestCase):
    """Check that FixCSV reads the csv whose path was given at construction."""

    def test_read_csv(self):
        """Fix column 0 of the bundled test data and check its last value."""
        fixer = FixCSV(csv_path=get_test_data())
        fixer.set_possible_values(["test"])
        fixed = fixer.fix_csv_column(0)
        self.assertEqual("test", fixed[-1])

9 changes: 3 additions & 6 deletions test/test_fix_csv.py → test_fixcsv/test_fix_csv.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
import unittest
import sys, os

sys.path.append(os.path.abspath(sys.path[0]) + "/../")
from fix_csv import FixCSV
from fix_csv.fix_csv import FixCSV


class TestFixCSV(unittest.TestCase):
def setUp(self):
self.fcsv = FixCSV(autofix=True, quiet=True)
self.fcsv = FixCSV(auto_fix=True, quiet=True)

def test_add_possible_value(self):
self.fcsv.add_possible_value("possible")
Expand All @@ -34,7 +31,7 @@ def test_dkefs_fake_data(self):
"CTGV3",
]
self.fcsv.set_possible_values(possible_values)
fixed_column = self.fcsv.fix_csv_column("DKEFSFAKEDATA.csv", 9)
fixed_column = self.fcsv.fix_csv_column(9, "data/DKEFSFAKEDATA.csv")
self.assertLess(len(set(fixed_column)), len(possible_values))


Expand Down