From 918bc6aa0c98258cdddbdbe9f0d55c8449bb8bf7 Mon Sep 17 00:00:00 2001
From: Ryan Lim <rzlim08@gmail.com>
Date: Wed, 28 Aug 2019 23:30:36 -0400
Subject: [PATCH 1/2] added main file

---
 fix_csv/__main__.py |   4 ++
 fix_csv/fix_csv.py  | 153 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 fix_csv/__main__.py
 create mode 100644 fix_csv/fix_csv.py

diff --git a/fix_csv/__main__.py b/fix_csv/__main__.py
new file mode 100644
index 0000000..d1d597b
--- /dev/null
+++ b/fix_csv/__main__.py
@@ -0,0 +1,4 @@
+from .fix_csv import main
+
+if __name__ == "__main__":
+    main()
diff --git a/fix_csv/fix_csv.py b/fix_csv/fix_csv.py
new file mode 100644
index 0000000..cf5b57a
--- /dev/null
+++ b/fix_csv/fix_csv.py
@@ -0,0 +1,153 @@
+"""Fix csv module"""
+import argparse
+from typing import List
+from difflib import SequenceMatcher
+import pandas as pd
+
+
+class FixCSV:
+    """Class that fixes a csv column using the closest valid input for each value"""
+
+    def __init__(
+        self,
+        delimiter: str = ",",
+        auto_fix: bool = True,
+        quiet: bool = False,
+        csv_path: str = None,
+    ):
+        """store the csv options; possible values start out empty"""
+        self.possible_values = []
+        self.delimiter = delimiter
+        self.auto_fix = auto_fix
+        self.quiet = quiet
+        self.fixed_csv = None
+        self.csv_path = csv_path
+
+    def set_possible_values(self, possible_values: List[str]):
+        """set all possible values that the column could be (stored as a copy)"""
+        self.possible_values = list(possible_values)
+
+    def add_possible_value(self, possible_value: str):
+        """add a possible value to list"""
+        self.possible_values.append(possible_value)
+
+    def get_possible_values(self) -> List[str]:
+        """return all possible values"""
+        return self.possible_values
+
+    def fix_csv_column(self, column: int, csv_path: str = None) -> List[str]:
+        """read in csv (csv_path argument, else self.csv_path) and fix column"""
+        # an explicit argument wins over the path given to the constructor
+        path = csv_path or self.csv_path
+        if path is None:
+            raise ValueError("no csv path: pass csv_path here or to the constructor")
+        csv = pd.read_csv(path, delimiter=self.delimiter, header=None)
+
+        fixed_column = [self.edit_string(val) for val in list(csv.iloc[:, column])]
+        csv.iloc[:, column] = pd.Series(fixed_column)
+        self.fixed_csv = csv
+        return fixed_column
+
+    def edit_string(self, string_to_validate: str) -> str:
+        """return the string if it is already valid, else the fixed replacement"""
+        if string_to_validate in self.possible_values:
+            return string_to_validate
+        ratios = self.get_ratios(string_to_validate)
+        if self.auto_fix:
+            fixed_val = self.autofix_string(ratios, string_to_validate)
+        else:
+            fixed_val = self.manual_fix_string(ratios, string_to_validate)
+        return fixed_val
+
+    def get_ratios(self, string_to_validate: str) -> List[float]:
+        """for each possible value, return its similarity ratio to the input string"""
+        return [
+            SequenceMatcher(None, val, string_to_validate).ratio()
+            for val in self.possible_values
+        ]
+
+    def manual_fix_string(self, ratios, string_to_validate: str):
+        """manually fix the string with inputs"""
+        # https://stackoverflow.com/questions/13070461/get-index-of-the-top-n-values-of-a-list-in-python
+        print("\n")
+        sorted_ratios = sorted(
+            range(len(ratios)), key=lambda i: ratios[i], reverse=True
+        )
+        print(string_to_validate)
+        # offer at most three candidates; fewer than three values may be configured
+        top = sorted_ratios[:3]
+        for i, ind in enumerate(top, start=1):
+            print("[{}]: {} - {}".format(i, self.possible_values[ind], ratios[ind]))
+        print("[4]: manual input")
+        print("[5]: do not modify \n")
+        while True:
+            val = input("Top 3 most similar values: select an option: ")
+            try:
+                selection = int(val)
+            except ValueError:
+                # int() signals non-numeric text with ValueError, not TypeError
+                print("You have entered an invalid response")
+                continue
+            if selection == 5:
+                selected_value = string_to_validate
+            elif selection == 4:
+                selected_value = input(
+                    "Write the input you want to replace the string with: "
+                )
+            elif 0 < selection <= len(top):
+                selected_value = self.possible_values[top[selection - 1]]
+            else:
+                # also rejects 1-3 when that candidate slot does not exist
+                print("You have entered a number that is not an option")
+                continue
+            print("You have selected: ", selected_value)
+            return selected_value
+
+    def autofix_string(self, ratios, string_to_validate):
+        """return the possible value most similar to the string (last one on ties)"""
+        if not ratios:
+            # no possible values configured: nothing to compare, keep the input
+            return string_to_validate
+        best = max(ratios)  # computed once; it used to be re-evaluated per element
+        max_ind = [ind for ind, val in enumerate(ratios) if val == best]
+        if len(max_ind) > 1:
+            print("Warning: more than one values with max value")
+        fixed_val = self.possible_values[max_ind.pop()]
+        if not self.quiet:
+            print(string_to_validate, "->", fixed_val)
+        return fixed_val
+
+    def read_possible_values(self, possible_values: str):
+        """load the possible values from a text file, one value per line"""
+        with open(possible_values, "r") as file:
+            # skip blank lines so they are not registered as empty possible values
+            val = [line.strip() for line in file if line.strip()]
+        self.set_possible_values(val)
+
+
+def main():
+    """cli entry point: parse the arguments, fix the column and emit the result"""
+    parser = argparse.ArgumentParser(
+        description="automatically fixes csv columns with acceptable values"
+    )
+    parser.add_argument("csv_path", nargs=1)
+    parser.add_argument("column", nargs=1)
+    parser.add_argument("--possible_values", nargs="+", required=True)
+    parser.add_argument("--output", nargs="?")
+    parser.add_argument("--no-auto-fix", action="store_true")
+    parser.add_argument("--quiet", action="store_true")
+    args = parser.parse_args()
+    fcsv = FixCSV(
+        auto_fix=not args.no_auto_fix, quiet=args.quiet, csv_path=args.csv_path[0]
+    )
+    fcsv.set_possible_values(args.possible_values)
+    fcsv.fix_csv_column(column=int(args.column[0]))
+    if args.output is not None:
+        fcsv.fixed_csv.to_csv(args.output, index=False, header=False)
+    else:
+        for _, row in fcsv.fixed_csv.iterrows():
+            print(row)
+
+
+if __name__ == "__main__":
+    main()

From 687edbc0f3fb0ebc7faa731ea76521a08aeaac4c Mon Sep 17 00:00:00 2001
From: Ryan Lim <rzlim08@gmail.com>
Date: Sat, 31 Aug 2019 14:40:05 -0400
Subject: [PATCH 2/2] added setup file

---
 .gitignore                            |   3 +
 __init__.py                           |   3 +
 fix_csv.py                            | 119 --------------------------
 fix_csv/fix_csv.py                    |   2 +-
 setup.py                              |   8 ++
 test_fixcsv/data/demo_header.csv      |  11 +++
 test_fixcsv/data/demo_noheader.csv    |  10 +++
 test_fixcsv/data/demodata.csv         |  11 +++
 test_fixcsv/data/getdata.py           |   7 ++
 test_fixcsv/data/testdata.csv         |   9 ++
 {test => test_fixcsv}/manual_test.py  |   4 +-
 test_fixcsv/test_csv.py               |  13 +++
 {test => test_fixcsv}/test_fix_csv.py |   9 +-
 13 files changed, 81 insertions(+), 128 deletions(-)
 create mode 100644 __init__.py
 delete mode 100644 fix_csv.py
 create mode 100644 setup.py
 create mode 100644 test_fixcsv/data/demo_header.csv
 create mode 100644 test_fixcsv/data/demo_noheader.csv
 create mode 100644 test_fixcsv/data/demodata.csv
 create mode 100644 test_fixcsv/data/getdata.py
 create mode 100644 test_fixcsv/data/testdata.csv
 rename {test => test_fixcsv}/manual_test.py (87%)
 create mode 100644 test_fixcsv/test_csv.py
 rename {test => test_fixcsv}/test_fix_csv.py (81%)

diff --git a/.gitignore b/.gitignore
index 48b52a4..9b18598 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@ venv
 .idea
 __pycache*
 DKEFSFAKEDATA.csv
+build/
+dist/
+*egg*
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e9feecc
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,3 @@
+import fix_csv.fix_csv
+import test_fixcsv.test_fix_csv
+import test_fixcsv.data.getdata
\ No newline at end of file
diff --git a/fix_csv.py b/fix_csv.py
deleted file mode 100644
index 3155858..0000000
--- a/fix_csv.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import sys
-from typing import List
-import pandas as pd
-from difflib import SequenceMatcher
-
-
-class FixCSV:
-    def __init__(
-        self, delimiter: str = ",", autofix: bool = False, quiet: bool = False
-    ):
-        self.possible_values = []
-        self.delimiter = delimiter
-        self.autofix = autofix
-        self.quiet = quiet
-        self.fixed_csv = None
-
-    def set_possible_values(self, possible_values: List[str]):
-        self.possible_values = possible_values
-
-    def add_possible_value(self, possible_value: str):
-        self.possible_values.append(possible_value)
-
-    def get_possible_values(self) -> List[str]:
-        return self.possible_values
-
-    def fix_csv_column(self, csv_path: str, column: int) -> List[str]:
-        csv = pd.read_csv(csv_path, delimiter=self.delimiter, header=None)
-        csv_column = list(csv.iloc[:, column])
-        fixed_column = [self.validate_string(val) for val in csv_column]
-        csv.iloc[:, column] = pd.Series(fixed_column)
-        self.fixed_csv = csv
-        return fixed_column
-
-    def validate_string(self, string_to_validate: str):
-        if string_to_validate in self.possible_values:
-            return string_to_validate
-        else:
-            fixed_val = self.edit_string(string_to_validate)
-        return fixed_val
-
-    def edit_string(self, string_to_validate):
-        ratios = [
-            SequenceMatcher(None, val, string_to_validate).ratio()
-            for val in self.possible_values
-        ]
-        if self.autofix:
-            fixed_val = self.autofix_string(ratios, string_to_validate)
-        else:
-            fixed_val = self.manual_fix_string(ratios, string_to_validate)
-        return fixed_val
-
-    def manual_fix_string(self, ratios, string_to_validate: str):
-        # via https://stackoverflow.com/questions/13070461/get-index-of-the-top-n-values-of-a-list-in-python
-        print("\n")
-        sorted_ratios = sorted(
-            range(len(ratios)), key=lambda i: ratios[i], reverse=True
-        )
-        print(string_to_validate)
-        for i in range(1, 4):
-            print(
-                "[{}]: {} - {}".format(
-                    i,
-                    self.possible_values[sorted_ratios[i - 1]],
-                    ratios[sorted_ratios[i - 1]],
-                )
-            )
-        print("[4]: manual input")
-        print("[5]: do not modify \n")
-        valid_input = False
-        while not valid_input:
-            val = input("Top 3 most similar values: select an option: ")
-            try:
-                selection = int(val)
-                if selection >= 1 and selection <= 5:
-                    valid_input = True
-                    if selection == 5:
-                        selected_value = string_to_validate
-                    elif selection == 4:
-                        selected_value = input(
-                            "Write the input you want to replace the string with: "
-                        )
-                    else:
-                        selected_value = self.possible_values[
-                            sorted_ratios[selection - 1]
-                        ]
-                    print("You have selected: ", selected_value)
-
-                else:
-                    print("You have entered a number that is not an option")
-            except:
-                print("You have entered an invalid response")
-
-        return selected_value
-
-    def autofix_string(self, ratios, string_to_validate):
-        max_ind = [ind for ind, val in enumerate(ratios) if val == max(ratios)]
-        if len(max_ind) > 1:
-            print("Warning: more than one values with max value")
-        fixed_val = self.possible_values[max_ind.pop()]
-        if not self.quiet:
-            print(string_to_validate, "->", fixed_val)
-        return fixed_val
-
-
-if __name__ == "__main__":
-    if len(sys.argv) < 4:
-        print("Not enough arguments")
-        exit(1)
-
-    csv_path = sys.argv[1]
-    column = int(sys.argv[2])
-    path_to_possible_values = sys.argv[3]
-    with open(path_to_possible_values, "r") as f:
-        val = [line.strip() for line in f]
-    fcsv = FixCSV()
-    fcsv.set_possible_values(val)
-    fixed_column = fcsv.fix_csv_column(csv_path, column)
-    if fcsv.fixed_csv is not None:
-        fcsv.fixed_csv.to_csv(csv_path.replace(".", "_fix."), index=None, header=None)
diff --git a/fix_csv/fix_csv.py b/fix_csv/fix_csv.py
index cf5b57a..3801a7b 100644
--- a/fix_csv/fix_csv.py
+++ b/fix_csv/fix_csv.py
@@ -67,7 +67,7 @@ def get_ratios(self, string_to_validate: str) -> List[float]:
         ]
 
     def manual_fix_string(self, ratios, string_to_validate: str):
-        """manually fix the string with inputs"""
+        """manually fix the string with bash inputs"""
         # https://stackoverflow.com/questions/13070461/get-index-of-the-top-n-values-of-a-list-in-python
         print("\n")
         sorted_ratios = sorted(
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..0e3cbbb
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,10 @@
+from setuptools import setup
+
+setup(
+    name="FixCsv",
+    version="0.0.1",
+    packages=["fix_csv"],
+    # fix_csv/fix_csv.py imports pandas at module level, so it must be installed
+    install_requires=["pandas"],
+    entry_points={"console_scripts": ["fix_csv = fix_csv.__main__:main"]},
+)
diff --git a/test_fixcsv/data/demo_header.csv b/test_fixcsv/data/demo_header.csv
new file mode 100644
index 0000000..88e793a
--- /dev/null
+++ b/test_fixcsv/data/demo_header.csv
@@ -0,0 +1,11 @@
+clientid,weekdays,up
+0,Wed,False
+1,Thu,True
+2,Fri,False
+3,Sat,False
+4,Sun,True
+5,Mon,False
+6,Tue,True
+7,Wed,True
+8,Thu,False
+9,Fri,False
diff --git a/test_fixcsv/data/demo_noheader.csv b/test_fixcsv/data/demo_noheader.csv
new file mode 100644
index 0000000..28981cd
--- /dev/null
+++ b/test_fixcsv/data/demo_noheader.csv
@@ -0,0 +1,10 @@
+0,Wed,False
+1,Thu,True
+2,Fri,False
+3,Sat,False
+4,Sun,True
+5,Mon,False
+6,Tue,True
+7,Wed,True
+8,Thu,False
+9,Fri,False
diff --git a/test_fixcsv/data/demodata.csv b/test_fixcsv/data/demodata.csv
new file mode 100644
index 0000000..c167c4c
--- /dev/null
+++ b/test_fixcsv/data/demodata.csv
@@ -0,0 +1,11 @@
+clientid,date,weekdays,gains,prices,up
+0,2008-04-30,Wed,-0.52458192906686452,7791404.0091921333,False
+1,2008-05-01,Thu,0.076191536201738269,3167180.7366340165,True
+2,2008-05-02,Fri,-0.86850970062880861,9589766.9613829032,False
+3,2008-05-03,Sat,-0.42701083852713395,8949415.1867596991,False
+4,2008-05-04,Sun,0.2532553652693274,937163.44375252665,True
+5,2008-05-05,Mon,-0.68151636911081892,949579.88022264629,False
+6,2008-05-06,Tue,0.0071911579626532168,7268426.906552773,True
+7,2008-05-07,Wed,0.67449747200412147,7517014.782897247,True
+8,2008-05-08,Thu,-1.1841008656818983,1920959.5423492221,False
+9,2008-05-09,Fri,-1.5803692595811152,8456240.6198725495,False
diff --git a/test_fixcsv/data/getdata.py b/test_fixcsv/data/getdata.py
new file mode 100644
index 0000000..fa2d628
--- /dev/null
+++ b/test_fixcsv/data/getdata.py
@@ -0,0 +1,7 @@
+import os
+
+FILEPATH = os.path.dirname(os.path.abspath(__file__))
+
+
+def get_test_data():  # absolute path of testdata.csv in this module's directory
+    return os.path.join(FILEPATH, "testdata.csv")
diff --git a/test_fixcsv/data/testdata.csv b/test_fixcsv/data/testdata.csv
new file mode 100644
index 0000000..ff58177
--- /dev/null
+++ b/test_fixcsv/data/testdata.csv
@@ -0,0 +1,9 @@
+test
+test
+test
+test
+tset
+terst
+test
+test
+test
\ No newline at end of file
diff --git a/test/manual_test.py b/test_fixcsv/manual_test.py
similarity index 87%
rename from test/manual_test.py
rename to test_fixcsv/manual_test.py
index 21cbc39..5eeaf63 100644
--- a/test/manual_test.py
+++ b/test_fixcsv/manual_test.py
@@ -2,11 +2,11 @@
 import os
 
 sys.path.append(os.path.abspath(sys.path[0]) + "/../")
-from fix_csv import FixCSV
+from fix_csv.fix_csv import FixCSV
 
 
 def main():
-    fcsv = FixCSV(autofix=False)
+    fcsv = FixCSV(auto_fix=False)
     possible_values = [
         "ACGA",
         "ACGA_FU2",
diff --git a/test_fixcsv/test_csv.py b/test_fixcsv/test_csv.py
new file mode 100644
index 0000000..cd80231
--- /dev/null
+++ b/test_fixcsv/test_csv.py
@@ -0,0 +1,13 @@
+import unittest
+from test_fixcsv.data.getdata import get_test_data
+from fix_csv.fix_csv import FixCSV
+
+
+class TestCSV(unittest.TestCase):
+    def test_read_csv(self):
+        data_path = get_test_data()
+        fc = FixCSV(quiet=True, csv_path=data_path)
+        fc.set_possible_values(["test"])
+        vals = fc.fix_csv_column(0)
+        self.assertEqual(["test"] * 9, vals)  # "tset"/"terst" rows must be fixed too
+
diff --git a/test/test_fix_csv.py b/test_fixcsv/test_fix_csv.py
similarity index 81%
rename from test/test_fix_csv.py
rename to test_fixcsv/test_fix_csv.py
index b9080b0..0e4345c 100644
--- a/test/test_fix_csv.py
+++ b/test_fixcsv/test_fix_csv.py
@@ -1,13 +1,10 @@
 import unittest
-import sys, os
-
-sys.path.append(os.path.abspath(sys.path[0]) + "/../")
-from fix_csv import FixCSV
+from fix_csv.fix_csv import FixCSV
 
 
 class TestFixCSV(unittest.TestCase):
     def setUp(self):
-        self.fcsv = FixCSV(autofix=True, quiet=True)
+        self.fcsv = FixCSV(auto_fix=True, quiet=True)
 
     def test_add_possible_value(self):
         self.fcsv.add_possible_value("possible")
@@ -34,7 +31,7 @@ def test_dkefs_fake_data(self):
             "CTGV3",
         ]
         self.fcsv.set_possible_values(possible_values)
-        fixed_column = self.fcsv.fix_csv_column("DKEFSFAKEDATA.csv", 9)
+        fixed_column = self.fcsv.fix_csv_column(9, "data/DKEFSFAKEDATA.csv")
         self.assertLess(len(set(fixed_column)), len(possible_values))