-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSQL Query for data cleaning(Transaction).sql
More file actions
111 lines (95 loc) · 2.97 KB
/
SQL Query for data cleaning(Transaction).sql
File metadata and controls
111 lines (95 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
--Cleaning Data in SQL Querys
----------------------------------
USE Storedb
--Creating a Cleaned Table
-----------------------------------------------------------------------------
CREATE TABLE Cleaned_Transaction( --(Using this because Select into is blocked)
Customer_ID INT,
Name VARCHAR(100),
Transaction_ID VARCHAR(20),
Product VARCHAR(100),
Purchase_Date VARCHAR(50), --(Because it's varchar in original table)
Quantity INT,
Price INT,
Total INT,
Status VARCHAR(50));
INSERT INTO Cleaned_Transaction
SELECT * FROM Transactions;
--Standardizeing Date Format
--------------------------------------
ALTER TABLE Cleaned_Transaction
ADD Formatted_Purchase_Date VARCHAR(50);
UPDATE Cleaned_Transaction
SET Formatted_Purchase_Date =
CASE
WHEN ISDATE(Purchase_Date) =1
THEN FORMAT(CONVERT(Date,Purchase_Date),'dd/MM/yyyy')
ELSE NULL
END;
-- Fixing Status column
-----------------------------------
UPDATE Cleaned_Transaction
SET [Status] = CASE
WHEN [Status] IS NULL THEN
'Unknown'
WHEN LOWER([Status]) IN
('returned','return') THEN
'Returned'
WHEN LOWER([Status]) IN
('processing','process') THEN
'Processing'
WHEN LOWER([Status]) IN
('delivered','deliverd') THEN
'Delivered'
WHEN LOWER([Status]) IN
('cancelled','canceled') THEN
'Cancelled'
ELSE [Status]
END;
-- Fixing Product Name or "Capitalization"
--------------------------------------
UPDATE Cleaned_Transaction
SET [Product] =
CASE
WHEN CHARINDEX(' ', LTRIM([Product])) > 0 THEN
CONCAT(
UPPER(LEFT(LTRIM([Product]), 1)),
LOWER(SUBSTRING(LTRIM([Product]), 2, CHARINDEX(' ', LTRIM([Product])) - 1)),
' ',
UPPER(SUBSTRING(LTRIM([Product]), CHARINDEX(' ', LTRIM([Product])) + 1, 1)),
LOWER(SUBSTRING(
LTRIM([Product]),
CHARINDEX(' ', LTRIM([Product])) + 2,
LEN(LTRIM([Product]))
))
)
ELSE
UPPER(LEFT(LTRIM([Product]), 1)) + LOWER(SUBSTRING(LTRIM([Product]), 2, LEN([Product])))
END
WHERE [Product] IS NOT NULL;
-- Calculate and Fix Missing Totals
-------------------------------------------
UPDATE Cleaned_Transaction
SET Total = Quantity * Price
WHERE Total IS NULL AND Quantity IS NOT NULL AND Price IS NOT NULL;
--Handling Missing Data
----------------------------------------
DELETE FROM Cleaned_Transaction
WHERE Product IS NULL
OR Quantity IS NULL
OR Price IS NULL
OR Total IS NULL
OR Formatted_Purchase_Date IS NULL
-- I am removing all nulls without care because this table is for showing trends.
-- Removing Duplicates Rows
With RowCTE as (
select * , ROW_NUMBER() OVER (
PARTITION BY Customer_ID, Transaction_ID, Product, Purchase_date
ORDER BY (SELECT NULL))
AS rn
FROM Cleaned_Transaction
)
DELETE FROM RowCTE WHERE rn > 1;
--Final cleaned table
SELECT*
FROM Cleaned_Transaction