-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathCSV-Extractor.cpp
More file actions
144 lines (137 loc) · 5.77 KB
/
CSV-Extractor.cpp
File metadata and controls
144 lines (137 loc) · 5.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#include <cstddef>
#include <fstream>
#include <istream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
using namespace std;
// Extracts the columns this project uses from the Amazon Prime titles CSV.
//
// Expected input columns, in order:
//   show_id, type, title, director, cast, country, date_added,
//   release_year, rating, duration, listed_in, description
//
// Every parsed record becomes a vector of six string lists:
/*
0 - Title    (one element: the full title, embedded commas/quotes preserved)
1 - Type     (one element, normalised, e.g. "TV Show" -> "Shows")
2 - vector of Actors/Directors (first listed director, then each cast member)
3 - Rating   (one element, normalised to the "13+"/"16+"/... scale)
4 - Duration (one element, minute counts bucketed, e.g. "113 min" -> "1.5-2hrs")
5 - Genre    (one element per listed genre, normalised)
*/
class csvExtraction {
public:
    // Reads "../amazon_prime_titles.csv" (relative to the working directory)
    // and returns one entry per record in the layout described above.
    // Returns an empty result when the file cannot be opened or is empty.
    std::vector<std::vector<std::vector<std::string>>> extractData() {
        return extractDataHelper("../amazon_prime_titles.csv");
    }

    // Same as extractData(), but reads the CSV text from any input stream.
    // The first line is treated as the header and discarded; an empty or
    // missing header yields an empty result.
    std::vector<std::vector<std::vector<std::string>>> extractData(std::istream& input) {
        std::vector<std::vector<std::vector<std::string>>> result;

        std::string header;
        if (!std::getline(input, header) || header.empty()) {
            return result;
        }

        std::string line;
        while (std::getline(input, line)) {
            // Tolerate CRLF line endings.
            if (!line.empty() && line.back() == '\r') {
                line.pop_back();
            }
            // Records start with a show_id such as "s1"; anything else
            // (blank lines, stray fragments) is skipped.
            if (line.empty() || line[0] != 's') {
                continue;
            }

            const std::vector<std::string> fields = splitCsvLine(line);
            std::vector<std::vector<std::string>> row(6);  // six output slots

            // Titles and names are kept verbatim: the category replacements
            // below must never touch them (a title such as "Gemini" contains
            // "min" and would otherwise be mistaken for a duration).
            row[0].push_back(fieldAt(fields, kTitle));
            row[1].push_back(normalizeCategory(fieldAt(fields, kType)));

            // Slot 2 always begins with the first listed director (an empty
            // string when the column is empty), followed by the cast list.
            std::vector<std::string> people = splitList(fieldAt(fields, kCast));
            people.insert(people.begin(), splitList(fieldAt(fields, kDirector)).front());
            row[2] = std::move(people);

            row[3].push_back(normalizeCategory(fieldAt(fields, kRating)));
            row[4].push_back(normalizeCategory(fieldAt(fields, kDuration)));

            std::vector<std::string> genres = splitList(fieldAt(fields, kGenres));
            for (std::string& genre : genres) {
                genre = normalizeCategory(genre);
            }
            row[5] = std::move(genres);

            result.push_back(std::move(row));
        }
        return result;
    }

private:
    // Zero-based positions of the input columns.
    enum Column {
        kShowId = 0,
        kType = 1,
        kTitle = 2,
        kDirector = 3,
        kCast = 4,
        kCountry = 5,
        kDateAdded = 6,
        kReleaseYear = 7,
        kRating = 8,
        kDuration = 9,
        kGenres = 10,
        kDescription = 11
    };

    // Opens `filename` and parses it; an unopenable file gives an empty result
    // because the header read fails.
    std::vector<std::vector<std::vector<std::string>>> extractDataHelper(const std::string& filename) {
        std::ifstream file(filename);
        return extractData(file);
    }

    // Returns fields[index], or an empty string when the record is too short.
    static std::string fieldAt(const std::vector<std::string>& fields, std::size_t index) {
        return index < fields.size() ? fields[index] : std::string();
    }

    // Splits one CSV record into its fields. Quoted fields may contain commas,
    // and a doubled quote ("") inside a quoted field stands for one literal
    // quote character. The surrounding quotes are removed.
    static std::vector<std::string> splitCsvLine(const std::string& line) {
        std::vector<std::string> fields;
        std::string current;
        bool inQuotes = false;

        for (std::size_t i = 0; i < line.size(); ++i) {
            const char c = line[i];
            if (inQuotes) {
                if (c != '"') {
                    current += c;
                } else if (i + 1 < line.size() && line[i + 1] == '"') {
                    current += '"';  // escaped quote
                    ++i;
                } else {
                    inQuotes = false;  // closing quote
                }
            } else if (c == '"') {
                inQuotes = true;
            } else if (c == ',') {
                fields.push_back(current);
                current.clear();
            } else {
                current += c;
            }
        }
        fields.push_back(current);
        return fields;
    }

    // Strips leading and trailing spaces/tabs.
    static std::string trimSpaces(const std::string& text) {
        const std::size_t first = text.find_first_not_of(" \t");
        if (first == std::string::npos) {
            return std::string();
        }
        const std::size_t last = text.find_last_not_of(" \t");
        return text.substr(first, last - first + 1);
    }

    // Splits a comma-separated list ("A, B, C") into trimmed items.
    // Always returns at least one element; an empty cell yields {""}.
    static std::vector<std::string> splitList(const std::string& cell) {
        std::vector<std::string> items;
        std::size_t start = 0;
        while (true) {
            const std::size_t comma = cell.find(',', start);
            const std::size_t end = (comma == std::string::npos) ? cell.size() : comma;
            items.push_back(trimSpaces(cell.substr(start, end - start)));
            if (comma == std::string::npos) {
                break;
            }
            start = comma + 1;
        }
        return items;
    }

    // Maps the text before "min" to a duration bucket; "Unknown" when it does
    // not start with a number that fits in an int.
    static std::string bucketMinutes(const std::string& minutesText) {
        int minutes = 0;
        try {
            minutes = std::stoi(minutesText);
        } catch (const std::logic_error&) {
            // std::invalid_argument and std::out_of_range both derive from
            // std::logic_error.
            return "Unknown";
        }
        if (minutes < 60) return "<1hr";
        if (minutes <= 90) return "1-1.5hrs";
        if (minutes <= 120) return "1.5-2hrs";
        return ">2hrs";
    }

    // Normalises a type, rating, duration or genre value. Exact matches are
    // replaced from the table; otherwise a value containing "min" is bucketed
    // as a duration; anything else is returned unchanged.
    static std::string normalizeCategory(const std::string& cell) {
        static const char* const kReplacements[][2] = {
            {"Action", "Action and adventure"},
            {"Suspense", "Mystery and thrillers"},
            {"Science Fiction", "Science fiction"},
            {"UNRATED", "18+"},
            {"R", "16+"},
            {"PG-13", "13+"},
            {"PG", "7+"},
            {"NR", "18+"},
            {"NOT-RATE", "18+"},
            {"NC-17", "16+"},
            {"G", "All"},
            {"ALL", "All"},
            {"AGES_18_", "18+"},
            {"TV Show", "Shows"},
        };
        for (const auto& entry : kReplacements) {
            if (cell == entry[0]) {
                return entry[1];
            }
        }

        // TODO: Fix for time ranges
        const std::size_t minPos = cell.find("min");
        if (minPos == std::string::npos) {
            return cell;
        }
        return bucketMinutes(cell.substr(0, minPos));
    }
};