From 918bc6aa0c98258cdddbdbe9f0d55c8449bb8bf7 Mon Sep 17 00:00:00 2001 From: Ryan Lim Date: Wed, 28 Aug 2019 23:30:36 -0400 Subject: [PATCH 1/2] added main file --- fix_csv/__main__.py | 4 ++ fix_csv/fix_csv.py | 153 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 fix_csv/__main__.py create mode 100644 fix_csv/fix_csv.py diff --git a/fix_csv/__main__.py b/fix_csv/__main__.py new file mode 100644 index 0000000..d1d597b --- /dev/null +++ b/fix_csv/__main__.py @@ -0,0 +1,4 @@ +from .fix_csv import main + +if __name__ == "__main__": + main() diff --git a/fix_csv/fix_csv.py b/fix_csv/fix_csv.py new file mode 100644 index 0000000..cf5b57a --- /dev/null +++ b/fix_csv/fix_csv.py @@ -0,0 +1,153 @@ +"""Fix csv module""" +import argparse +from typing import List +from difflib import SequenceMatcher +import pandas as pd + + +class FixCSV: + """ Class to help fixing csv columns with the closest valid input""" + + def __init__( + self, + delimiter: str = ",", + auto_fix: bool = True, + quiet: bool = False, + csv_path: str = None, + ): + """initialize class""" + self.possible_values = [] + self.delimiter = delimiter + self.auto_fix = auto_fix + self.quiet = quiet + self.fixed_csv = None + self.csv_path = csv_path + + def set_possible_values(self, possible_values: List[str]): + """set all possible values that the column could be""" + self.possible_values = possible_values + + def add_possible_value(self, possible_value: str): + """add a possible value to list""" + self.possible_values.append(possible_value) + + def get_possible_values(self) -> List[str]: + """return all possible values""" + return self.possible_values + + def fix_csv_column(self, column: int, csv_path: str = None) -> List[str]: + """read in csv and fix column""" + if csv_path: + csv = pd.read_csv(csv_path, delimiter=self.delimiter, header=None) + else: + print(self.csv_path) + csv = pd.read_csv(self.csv_path, delimiter=self.delimiter, header=None) + + fixed_column = [self.edit_string(val) for val in list(csv.iloc[:, column])] + csv.iloc[:, column] = pd.Series(fixed_column) + self.fixed_csv = csv + return fixed_column + + def edit_string(self, string_to_validate: str) -> str: + """get ratios and fix string""" + if string_to_validate in self.possible_values: + return string_to_validate + ratios = self.get_ratios(string_to_validate) + if self.auto_fix: + fixed_val = self.autofix_string(ratios, string_to_validate) + else: + fixed_val = self.manual_fix_string(ratios, string_to_validate) + return fixed_val + + def get_ratios(self, string_to_validate: str) -> List[float]: + """for each possible valid value, return the distance from the input string""" + return [ + SequenceMatcher(None, val, string_to_validate).ratio() + for val in self.possible_values + ] + + def manual_fix_string(self, ratios, string_to_validate: str): + """manually fix the string with inputs""" + # https://stackoverflow.com/questions/13070461/get-index-of-the-top-n-values-of-a-list-in-python + print("\n") + sorted_ratios = sorted( + range(len(ratios)), key=lambda i: ratios[i], reverse=True + ) + print(string_to_validate) + for i in range(1, 4): + print( + "[{}]: {} - {}".format( + i, + self.possible_values[sorted_ratios[i - 1]], + ratios[sorted_ratios[i - 1]], + ) + ) + print("[4]: manual input") + print("[5]: do not modify \n") + valid_input = False + while not valid_input: + val = input("Top 3 most similar values: select an option: ") + try: + selection = int(val) + if 0 < selection < 6: + valid_input = True + if selection == 5: + selected_value = string_to_validate + elif selection == 4: + selected_value = input( + "Write the input you want to replace the string with: " + ) + else: + selected_value = self.possible_values[ + sorted_ratios[selection - 1] + ] + print("You have selected: ", selected_value) + + else: + print("You have entered a number that is not an option") + except TypeError: + print("You have entered an invalid response") + + return selected_value + + def autofix_string(self, ratios, string_to_validate): + max_ind = [ind for ind, val in enumerate(ratios) if val == max(ratios)] + if len(max_ind) > 1: + print("Warning: more than one values with max value") + fixed_val = self.possible_values[max_ind.pop()] + if not self.quiet: + print(string_to_validate, "->", fixed_val) + return fixed_val + + def read_possible_values(self, possible_values: str): + with open(possible_values, "r") as file: + val = [line.strip() for line in file] + self.set_possible_values(val) + + +def main(): + parser = argparse.ArgumentParser( + "automatically fixes csv columns with acceptable values" + ) + parser.add_argument("csv_path", nargs=1) + parser.add_argument("column", nargs=1) + parser.add_argument("--possible_values", nargs="*") + parser.add_argument("--output", nargs="?") + parser.add_argument("--no-auto-fix", action="store_true") + parser.add_argument("--quiet", action="store_true") + args = parser.parse_args() + + fcsv = FixCSV(csv_path=args.csv_path[0]) + fcsv.set_possible_values(args.possible_values) + fcsv.fix_csv_column(column=int(args.column[0])) + if args.output is not None: + fcsv.fixed_csv.to_csv( + args.output, index=None, header=None + ) + else: + for _, row in fcsv.fixed_csv.iterrows(): + print(row) + + +if __name__ == "__main__": + main() From 687edbc0f3fb0ebc7faa731ea76521a08aeaac4c Mon Sep 17 00:00:00 2001 From: Ryan Lim Date: Sat, 31 Aug 2019 14:40:05 -0400 Subject: [PATCH 2/2] added setup file --- .gitignore | 3 + __init__.py | 3 + fix_csv.py | 119 -------------------------- fix_csv/fix_csv.py | 2 +- setup.py | 8 ++ test_fixcsv/data/demo_header.csv | 11 +++ test_fixcsv/data/demo_noheader.csv | 10 +++ test_fixcsv/data/demodata.csv | 11 +++ test_fixcsv/data/getdata.py | 7 ++ test_fixcsv/data/testdata.csv | 9 ++ {test => test_fixcsv}/manual_test.py | 4 +- test_fixcsv/test_csv.py | 13 +++ {test => test_fixcsv}/test_fix_csv.py | 9 +- 13 files changed, 81 insertions(+), 128 deletions(-) create mode 100644 __init__.py delete mode 100644 fix_csv.py create mode 100644 setup.py create mode 100644 test_fixcsv/data/demo_header.csv create mode 100644 test_fixcsv/data/demo_noheader.csv create mode 100644 test_fixcsv/data/demodata.csv create mode 100644 test_fixcsv/data/getdata.py create mode 100644 test_fixcsv/data/testdata.csv rename {test => test_fixcsv}/manual_test.py (87%) create mode 100644 test_fixcsv/test_csv.py rename {test => test_fixcsv}/test_fix_csv.py (81%) diff --git a/.gitignore b/.gitignore index 48b52a4..9b18598 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ venv .idea __pycache* DKEFSFAKEDATA.csv +build/ +dist/ +*egg* diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e9feecc --- /dev/null +++ b/__init__.py @@ -0,0 +1,3 @@ +import fix_csv.fix_csv +import test_fixcsv.test_fix_csv +import test_fixcsv.data.getdata \ No newline at end of file diff --git a/fix_csv.py b/fix_csv.py deleted file mode 100644 index 3155858..0000000 --- a/fix_csv.py +++ /dev/null @@ -1,119 +0,0 @@ -import sys -from typing import List -import pandas as pd -from difflib import SequenceMatcher - - -class FixCSV: - def __init__( - self, delimiter: str = ",", autofix: bool = False, quiet: bool = False - ): - self.possible_values = [] - self.delimiter = delimiter - self.autofix = autofix - self.quiet = quiet - self.fixed_csv = None - - def set_possible_values(self, possible_values: List[str]): - self.possible_values = possible_values - - def add_possible_value(self, possible_value: str): - self.possible_values.append(possible_value) - - def get_possible_values(self) -> List[str]: - return self.possible_values - - def fix_csv_column(self, csv_path: str, column: int) -> List[str]: - csv = pd.read_csv(csv_path, delimiter=self.delimiter, header=None) - csv_column = list(csv.iloc[:, column]) - fixed_column = [self.validate_string(val) for val in csv_column] - csv.iloc[:, column] = pd.Series(fixed_column) - self.fixed_csv = csv - return fixed_column - - def validate_string(self, string_to_validate: str): - if string_to_validate in self.possible_values: - return string_to_validate - else: - fixed_val = self.edit_string(string_to_validate) - return fixed_val - - def edit_string(self, string_to_validate): - ratios = [ - SequenceMatcher(None, val, string_to_validate).ratio() - for val in self.possible_values - ] - if self.autofix: - fixed_val = self.autofix_string(ratios, string_to_validate) - else: - fixed_val = self.manual_fix_string(ratios, string_to_validate) - return fixed_val - - def manual_fix_string(self, ratios, string_to_validate: str): - # via https://stackoverflow.com/questions/13070461/get-index-of-the-top-n-values-of-a-list-in-python - print("\n") - sorted_ratios = sorted( - range(len(ratios)), key=lambda i: ratios[i], reverse=True - ) - print(string_to_validate) - for i in range(1, 4): - print( - "[{}]: {} - {}".format( - i, - self.possible_values[sorted_ratios[i - 1]], - ratios[sorted_ratios[i - 1]], - ) - ) - print("[4]: manual input") - print("[5]: do not modify \n") - valid_input = False - while not valid_input: - val = input("Top 3 most similar values: select an option: ") - try: - selection = int(val) - if selection >= 1 and selection <= 5: - valid_input = True - if selection == 5: - selected_value = string_to_validate - elif selection == 4: - selected_value = input( - "Write the input you want to replace the string with: " - ) - else: - selected_value = self.possible_values[ - sorted_ratios[selection - 1] - ] - print("You have selected: ", selected_value) - - else: - print("You have entered a number that is not an option") - except: - print("You have entered an invalid response") - - return selected_value - - def autofix_string(self, ratios, string_to_validate): - max_ind = [ind for ind, val in enumerate(ratios) if val == max(ratios)] - if len(max_ind) > 1: - print("Warning: more than one values with max value") - fixed_val = self.possible_values[max_ind.pop()] - if not self.quiet: - print(string_to_validate, "->", fixed_val) - return fixed_val - - -if __name__ == "__main__": - if len(sys.argv) < 4: - print("Not enough arguments") - exit(1) - - csv_path = sys.argv[1] - column = int(sys.argv[2]) - path_to_possible_values = sys.argv[3] - with open(path_to_possible_values, "r") as f: - val = [line.strip() for line in f] - fcsv = FixCSV() - fcsv.set_possible_values(val) - fixed_column = fcsv.fix_csv_column(csv_path, column) - if fcsv.fixed_csv is not None: - fcsv.fixed_csv.to_csv(csv_path.replace(".", "_fix."), index=None, header=None) diff --git a/fix_csv/fix_csv.py b/fix_csv/fix_csv.py index cf5b57a..3801a7b 100644 --- a/fix_csv/fix_csv.py +++ b/fix_csv/fix_csv.py @@ -67,7 +67,7 @@ def get_ratios(self, string_to_validate: str) -> List[float]: ] def manual_fix_string(self, ratios, string_to_validate: str): - """manually fix the string with inputs""" + """manually fix the string with bash inputs""" # https://stackoverflow.com/questions/13070461/get-index-of-the-top-n-values-of-a-list-in-python print("\n") sorted_ratios = sorted( diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..0e3cbbb --- /dev/null +++ b/setup.py @@ -0,0 +1,8 @@ +from setuptools import setup + +setup( + name="FixCsv", + version="0.0.1", + packages=["fix_csv"], + entry_points={"console_scripts": ["fix_csv = fix_csv.__main__:main"]}, +) diff --git a/test_fixcsv/data/demo_header.csv b/test_fixcsv/data/demo_header.csv new file mode 100644 index 0000000..88e793a --- /dev/null +++ b/test_fixcsv/data/demo_header.csv @@ -0,0 +1,11 @@ +clientid,weekdays,up +0,Wed,False +1,Thu,True +2,Fri,False +3,Sat,False +4,Sun,True +5,Mon,False +6,Tue,True +7,Wed,True +8,Thu,False +9,Fri,False diff --git a/test_fixcsv/data/demo_noheader.csv b/test_fixcsv/data/demo_noheader.csv new file mode 100644 index 0000000..28981cd --- /dev/null +++ b/test_fixcsv/data/demo_noheader.csv @@ -0,0 +1,10 @@ +0,Wed,False +1,Thu,True +2,Fri,False +3,Sat,False +4,Sun,True +5,Mon,False +6,Tue,True +7,Wed,True +8,Thu,False +9,Fri,False diff --git a/test_fixcsv/data/demodata.csv b/test_fixcsv/data/demodata.csv new file mode 100644 index 0000000..c167c4c --- /dev/null +++ b/test_fixcsv/data/demodata.csv @@ -0,0 +1,11 @@ +clientid,date,weekdays,gains,prices,up +0,2008-04-30,Wed,-0.52458192906686452,7791404.0091921333,False +1,2008-05-01,Thu,0.076191536201738269,3167180.7366340165,True +2,2008-05-02,Fri,-0.86850970062880861,9589766.9613829032,False +3,2008-05-03,Sat,-0.42701083852713395,8949415.1867596991,False +4,2008-05-04,Sun,0.2532553652693274,937163.44375252665,True +5,2008-05-05,Mon,-0.68151636911081892,949579.88022264629,False +6,2008-05-06,Tue,0.0071911579626532168,7268426.906552773,True +7,2008-05-07,Wed,0.67449747200412147,7517014.782897247,True +8,2008-05-08,Thu,-1.1841008656818983,1920959.5423492221,False +9,2008-05-09,Fri,-1.5803692595811152,8456240.6198725495,False diff --git a/test_fixcsv/data/getdata.py b/test_fixcsv/data/getdata.py new file mode 100644 index 0000000..fa2d628 --- /dev/null +++ b/test_fixcsv/data/getdata.py @@ -0,0 +1,7 @@ +import os + +FILEPATH = os.path.dirname(os.path.abspath(__file__)) + + +def get_test_data(): + return os.path.join(FILEPATH, "testdata.csv") diff --git a/test_fixcsv/data/testdata.csv b/test_fixcsv/data/testdata.csv new file mode 100644 index 0000000..ff58177 --- /dev/null +++ b/test_fixcsv/data/testdata.csv @@ -0,0 +1,9 @@ +test +test +test +test +tset +terst +test +test +test \ No newline at end of file diff --git a/test/manual_test.py b/test_fixcsv/manual_test.py similarity index 87% rename from test/manual_test.py rename to test_fixcsv/manual_test.py index 21cbc39..5eeaf63 100644 --- a/test/manual_test.py +++ b/test_fixcsv/manual_test.py @@ -2,11 +2,11 @@ import os sys.path.append(os.path.abspath(sys.path[0]) + "/../") -from fix_csv import FixCSV +from fix_csv.fix_csv import FixCSV def main(): - fcsv = FixCSV(autofix=False) + fcsv = FixCSV(auto_fix=False) possible_values = [ "ACGA", "ACGA_FU2", diff --git a/test_fixcsv/test_csv.py b/test_fixcsv/test_csv.py new file mode 100644 index 0000000..cd80231 --- /dev/null +++ b/test_fixcsv/test_csv.py @@ -0,0 +1,13 @@ +import unittest +from test_fixcsv.data.getdata import get_test_data +from fix_csv.fix_csv import FixCSV + + +class TestCSV(unittest.TestCase): + def test_read_csv(self): + data_path = get_test_data() + fc = FixCSV(csv_path=data_path) + fc.set_possible_values(["test"]) + vals = fc.fix_csv_column(0) + self.assertEqual("test", vals.pop()) + diff --git a/test/test_fix_csv.py b/test_fixcsv/test_fix_csv.py similarity index 81% rename from test/test_fix_csv.py rename to test_fixcsv/test_fix_csv.py index b9080b0..0e4345c 100644 --- a/test/test_fix_csv.py +++ b/test_fixcsv/test_fix_csv.py @@ -1,13 +1,10 @@ import unittest -import sys, os - -sys.path.append(os.path.abspath(sys.path[0]) + "/../") -from fix_csv import FixCSV +from fix_csv.fix_csv import FixCSV class TestFixCSV(unittest.TestCase): def setUp(self): - self.fcsv = FixCSV(autofix=True, quiet=True) + self.fcsv = FixCSV(auto_fix=True, quiet=True) def test_add_possible_value(self): self.fcsv.add_possible_value("possible") @@ -34,7 +31,7 @@ def test_dkefs_fake_data(self): "CTGV3", ] self.fcsv.set_possible_values(possible_values) - fixed_column = self.fcsv.fix_csv_column("DKEFSFAKEDATA.csv", 9) + fixed_column = self.fcsv.fix_csv_column(9, "data/DKEFSFAKEDATA.csv") self.assertLess(len(set(fixed_column)), len(possible_values))