// lexer.cpp - lexical analyzer from the CP-compiler-version6 repository (Akash35721/CP-compiler-version6).

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <cctype>
#include <unordered_set>


// Token categories assigned by the lexer.
// The enum is unscoped, so the enumerators are used unqualified (e.g. `KT`).
enum TokenType {
    KT,   // Keyword Token: reserved word that is not a data type (if, else, while, ...)
    OT,   // Operator Token (+, -, ==, &&, ...)
    DT,   // Data Type Token (int, float, ...)
    CT,   // Constant Token: numeric, string, or character constant
    ScT,  // Separator Token: punctuation, braces, brackets, etc.
    IT    // Identifier Token: a name that is not a reserved word
};

// A single lexical token: its category plus the text taken from the input.
struct Token {
    TokenType type;      // category assigned by the lexer
    std::string lexeme;  // source text of the token; string/char constants keep their quotes
};

// Reserved words, kept in two sets so the lexer can tag them differently.

// Names of data types; a matching identifier is emitted as a DT token.
std::unordered_set<std::string> dataTypeKeywords = {
    "int", "float", "char", "bool", "double", "void", "string",
};

// All remaining reserved words; a matching identifier is emitted as a KT token.
std::unordered_set<std::string> otherKeywords = {
    "if", "else", "while", "for", "return",
    "switch", "case", "default", "break", "continue",
};

// Returns true when `c` is one of the basic operator characters:
//   + - * / = % & | ^ ! < >
// Every other character, including '\0', yields false.
bool isOperatorChar(char c) {
    switch (c) {
        case '+': case '-': case '*': case '/':
        case '=': case '%': case '&': case '|':
        case '^': case '!': case '<': case '>':
            return true;
        default:
            return false;
    }
}

// Returns true when `c` is one of the separator characters: ( ) ; , { } [ ]
// Every other character, including '\0', yields false.
bool isSeparatorChar(char c) {
    static const char kSeparators[] = {'(', ')', ';', ',', '{', '}', '[', ']'};
    for (char sep : kSeparators) {
        if (c == sep) {
            return true;
        }
    }
    return false;
}

// Splits `input` into a flat list of tokens, returned in source order.
//
// At each position the following rules are tried in this order:
//   - comments (`// ...` up to end of line, `/* ... */`): skipped, no token
//   - whitespace: skipped
//   - identifiers / reserved words: DT if in dataTypeKeywords, KT if in
//     otherKeywords, otherwise IT
//   - numeric constants (digits with an optional fractional part): CT
//   - string literals in double quotes (quotes kept in the lexeme): CT
//   - character literals in single quotes (quotes kept in the lexeme): CT
//   - operators (one character, or ==, <=, >=, !=, ++, --, &&, ||): OT
//   - separators: ScT
// Any character that matches no rule is skipped without producing a token.
//
// Inside string and character literals a backslash makes the following
// character part of the literal, so `"a\"b"`, '\n' and '\'' are single tokens.
//
// Malformed input never aborts the scan; problems are reported on std::cerr:
//   - an unterminated block comment consumes the rest of the input
//   - an unterminated string is still emitted as a CT token (no closing quote)
//   - an invalid character literal produces no token
std::vector<Token> tokenize(const std::string &input) {
    std::vector<Token> tokens;
    const size_t n = input.size();
    size_t i = 0;

    // The <cctype> classifiers have undefined behaviour for negative values,
    // which a plain `char` can hold for any non-ASCII byte, so every call goes
    // through unsigned char.
    const auto isSpace = [](char ch) {
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };
    const auto isDigit = [](char ch) {
        return std::isdigit(static_cast<unsigned char>(ch)) != 0;
    };
    const auto isIdentStart = [](char ch) {
        return std::isalpha(static_cast<unsigned char>(ch)) != 0 || ch == '_';
    };
    const auto isIdentPart = [](char ch) {
        return std::isalnum(static_cast<unsigned char>(ch)) != 0 || ch == '_';
    };

    while (i < n) {
        const char c = input[i];

        // Comments. A lone '/' falls through to the operator rule below.
        if (c == '/' && i + 1 < n) {
            const char next = input[i + 1];
            if (next == '/') {
                // Line comment: stop at the newline (the whitespace rule eats it).
                const size_t eol = input.find('\n', i + 2);
                i = (eol == std::string::npos) ? n : eol;
                continue;
            }
            if (next == '*') {
                // Block comment: the search starts after "/*" so "/*/" is not closed.
                const size_t close = input.find("*/", i + 2);
                if (close == std::string::npos) {
                    std::cerr << "Error: Block comment not terminated properly." << std::endl;
                    // Swallow everything that is left; otherwise the tail of the
                    // comment would be lexed as ordinary tokens.
                    i = n;
                } else {
                    i = close + 2;
                }
                continue;
            }
        }

        // Whitespace.
        if (isSpace(c)) {
            ++i;
            continue;
        }

        // Identifiers and reserved words (start with a letter or underscore).
        if (isIdentStart(c)) {
            const size_t start = i;
            while (i < n && isIdentPart(input[i])) {
                ++i;
            }
            const std::string lexeme = input.substr(start, i - start);
            if (dataTypeKeywords.find(lexeme) != dataTypeKeywords.end()) {
                tokens.push_back({DT, lexeme});
            } else if (otherKeywords.find(lexeme) != otherKeywords.end()) {
                tokens.push_back({KT, lexeme});
            } else {
                tokens.push_back({IT, lexeme});
            }
            continue;
        }

        // Numeric constants: digits, optionally followed by '.' and more digits.
        if (isDigit(c)) {
            const size_t start = i;
            while (i < n && isDigit(input[i])) {
                ++i;
            }
            if (i < n && input[i] == '.') {
                ++i;
                while (i < n && isDigit(input[i])) {
                    ++i;
                }
            }
            tokens.push_back({CT, input.substr(start, i - start)});
            continue;
        }

        // String literals; the lexeme keeps both quotes.
        if (c == '"') {
            std::string lexeme(1, c);
            ++i;
            while (i < n && input[i] != '"') {
                // Keep an escaped character (e.g. \" or \\) inside the literal
                // instead of letting it terminate the string.
                if (input[i] == '\\' && i + 1 < n) {
                    lexeme.push_back(input[i]);
                    ++i;
                }
                lexeme.push_back(input[i]);
                ++i;
            }
            if (i < n) {
                lexeme.push_back(input[i]); // closing quote
                ++i;
            } else {
                std::cerr << "Error: Missing closing double quote for string literal!" << std::endl;
            }
            tokens.push_back({CT, lexeme});
            continue;
        }

        // Character literals; the lexeme keeps both quotes.
        if (c == '\'') {
            std::string lexeme(1, c);
            ++i;

            bool hasChar = false;
            if (i + 1 < n && input[i] == '\\') {
                // Escape sequence such as '\n', '\'' or '\\': backslash plus one character.
                lexeme.push_back(input[i]);
                lexeme.push_back(input[i + 1]);
                i += 2;
                hasChar = true;
            } else if (i < n && input[i] != '\'') {
                lexeme.push_back(input[i]);
                ++i;
                hasChar = true;
            } else {
                std::cerr << "Error: Empty character literal detected!" << std::endl;
            }

            bool closed = false;
            if (i < n && input[i] == '\'') {
                lexeme.push_back(input[i]);
                ++i;
                closed = true;
            } else {
                std::cerr << "Error: Missing closing single quote for character literal!" << std::endl;
            }

            // Only a literal with exactly one (possibly escaped) character and a
            // closing quote becomes a token.
            if (hasChar && closed) {
                tokens.push_back({CT, lexeme});
            } else {
                std::cerr << "Error: Invalid character literal: " << lexeme << std::endl;
            }
            continue;
        }

        // Operators: prefer a recognised two-character operator, else one character.
        if (isOperatorChar(c)) {
            if (i + 1 < n && isOperatorChar(input[i + 1])) {
                const std::string twoChar = input.substr(i, 2);
                if (twoChar == "==" || twoChar == "<=" || twoChar == ">=" ||
                    twoChar == "!=" || twoChar == "++" || twoChar == "--" ||
                    twoChar == "&&" || twoChar == "||") {
                    tokens.push_back({OT, twoChar});
                    i += 2;
                    continue;
                }
            }
            tokens.push_back({OT, std::string(1, c)});
            ++i;
            continue;
        }

        // Separators.
        if (isSeparatorChar(c)) {
            tokens.push_back({ScT, std::string(1, c)});
            ++i;
            continue;
        }

        // No rule matched: skip the character.
        ++i;
    }

    return tokens;
}

// Maps a token category to the short label written to the output file.
// Note that ScT is rendered as "ST". Any value outside the enumerators
// yields "Unknown".
std::string tokenTypeToString(TokenType type) {
    if (type == KT) {
        return "KT";
    }
    if (type == OT) {
        return "OT";
    }
    if (type == DT) {
        return "DT";
    }
    if (type == CT) {
        return "CT";
    }
    if (type == ScT) {
        return "ST";
    }
    if (type == IT) {
        return "IT";
    }
    return "Unknown";
}
// //--------------------------------------------------------------------
// // Returns a longer description for each token type.
// std::string tokenTypeDescription(TokenType type) {
//     switch (type) {
//         case KT:  return "Keyword Token (non-data type, e.g. if, else, for)";
//         case OT:  return "Operator Token (e.g. +, -, ==)";
//         case DT:  return "Data Type Token (e.g. int, float, char)";
//         case CT:  return "Constant Token (numeric, string, or char constants)";
//         case ScT: return "Separator Token (punctuation, braces, semicolons, etc.)";
//         case IT:  return "Identifier Token (names for variables or functions)";
//         default:  return "Unknown Token";
//     }
// }

// // Function to print detailed token information on the terminal.
// void printDetailedTokens(const std::vector<Token>& tokens) {
//     std::cout << "=== Detailed Token Analysis ===\n\n";
//     for (const auto &token : tokens) {
//         std::cout << "Token: \"" << token.lexeme << "\"\n";
//         std::cout << "Type: " << tokenTypeToString(token.type)
//                   << " - " << tokenTypeDescription(token.type) << "\n";
//         std::cout << "---------------------------\n";
//     }
//     std::cout << "Total Tokens: " << tokens.size() << "\n";
// }
// //--------------------------------------------------------------------
// Program entry point: lexes a source file and writes one "<TYPE>:<lexeme>"
// line per token to the output file.
//
// Usage: lexer [input-file [output-file]]
//   input-file   defaults to "lexerinput.txt"
//   output-file  defaults to "lexeroutput.txt"; it is truncated on every run
//
// Returns 0 on success, or 1 if the input cannot be opened, the output cannot
// be opened, or writing the output fails.
int main(int argc, char *argv[]) {
    const std::string inputPath = (argc > 1) ? argv[1] : "lexerinput.txt";
    const std::string outputPath = (argc > 2) ? argv[2] : "lexeroutput.txt";

    // Read the entire input file into memory.
    std::ifstream inFile(inputPath);
    if (!inFile) {
        std::cerr << "Unable to open input file" << std::endl;
        return 1;
    }
    std::stringstream buffer;
    buffer << inFile.rdbuf();
    inFile.close();

    const std::vector<Token> tokens = tokenize(buffer.str());

    // Default open mode replaces any previous content; open with
    // std::ios::app instead to accumulate results across runs.
    std::ofstream outFile(outputPath);
    if (!outFile) {
        std::cerr << "Unable to open output file" << std::endl;
        return 1;
    }

    for (const auto &token : tokens) {
        outFile << tokenTypeToString(token.type) << ":" << token.lexeme << "\n";
    }

    // Flush before checking so that a failed write (e.g. a full disk) is
    // reported instead of silently producing a truncated file.
    outFile.flush();
    if (!outFile) {
        std::cerr << "Unable to write output file" << std::endl;
        return 1;
    }

    return 0;
}