api-documentation-quality-research/fixed_code_execution_engine.py at master · harrymower/api-documentation-quality-research · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
#!/usr/bin/env python3
"""
Fixed Code Execution Engine
Properly handles markdown formatting and code extraction from LLM responses
"""

import asyncio
import ast
import logging
import os
import re
import subprocess
import sys
import tempfile
import time
from typing import Any, Dict, Optional, Tuple

# Optional .env support: if python-dotenv is installed, load variables from a
# .env file into the process environment. This is strictly best-effort, so a
# missing package (ImportError) and any failure while loading are both ignored.
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:  # ImportError is a subclass of Exception
    pass

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)

class FixedCodeExecutionEngine:
    """Extract, validate, score and run Python code found in LLM responses.

    The engine owns a private temporary directory (``self.temp_dir``) in which
    generated scripts are written before being executed in a child Python
    process. Call :meth:`cleanup` when the engine is no longer needed to
    remove that directory; it is also removed when the engine is garbage
    collected.
    """

    # Maximum wall-clock seconds a generated script may run before it is killed.
    EXECUTION_TIMEOUT_SECONDS = 30

    # A complete fenced block: an opening fence line (with optional info
    # string such as "python"), the body, and a closing fence that starts a line.
    _FENCED_BLOCK_RE = re.compile(
        r'^[ \t]*```([^\n`]*)\n(.*?)^[ \t]*```',
        re.MULTILINE | re.DOTALL,
    )
    # Any single fence line, used only when no complete fenced block exists.
    _FENCE_LINE_RE = re.compile(r'^[ \t]*```[^\n]*\n?', re.MULTILINE)
    _PYTHON_FENCE_TAGS = frozenset({'python', 'python3', 'py'})

    # Line starts that signal "the code is over, prose follows". Everything
    # from the matching line to the end of the text is dropped.
    _TRAILING_PROSE_PATTERNS = tuple(
        re.compile(pattern, re.MULTILINE | re.DOTALL)
        for pattern in (
            r'\nThis Python code.*$',
            r'\nThe above code.*$',
            r'\nExplanation:.*$',
            r'\nNote:.*$',
            r'\nThis script.*$',
        )
    )

    # Substrings of (lower-cased) stdout that count as evidence of API success.
    _SUCCESS_INDICATORS = ('success', '200', 'ok', '{')

    # (status, response code, substrings of lower-cased stderr), checked in order.
    _ERROR_CLASSES = (
        ("auth_error", 401, ('401', 'unauthorized', 'invalid api key', 'forbidden')),
        ("api_error", 404, ('404', 'not found', 'bad request', '400')),
        ("network_error", None, ('connection', 'timeout', 'network')),
    )

    def __init__(self):
        # TemporaryDirectory (rather than mkdtemp) so the directory is removed
        # by cleanup() or, at the latest, when the engine is finalized.
        self._temp_dir_handle = tempfile.TemporaryDirectory()
        self.temp_dir = self._temp_dir_handle.name
        logger.info(f"Fixed code execution engine initialized with temp dir: {self.temp_dir}")

    def cleanup(self) -> None:
        """Remove the engine's temporary directory. Safe to call repeatedly."""
        handle = getattr(self, '_temp_dir_handle', None)
        if handle is not None:
            handle.cleanup()

    def extract_python_code(self, llm_output: str) -> str:
        """Extract clean Python code from LLM output with markdown formatting.

        If the output contains at least one complete fenced block, the body of
        one block is returned: the first block tagged as Python, otherwise the
        first untagged block, otherwise the first block of any kind. Prose
        before and after the fence is discarded.

        If there is no complete fenced block (plain code, or a fence that was
        never closed), stray fence lines are removed and text starting at a
        known "explanation" line is cut off.

        Args:
            llm_output: Raw text returned by the model.

        Returns:
            The extracted code with surrounding whitespace stripped; an empty
            string when nothing is left.
        """

        logger.info(f"Extracting Python code from {len(llm_output)} character LLM output")

        blocks = [
            (match.group(1).strip().lower(), match.group(2))
            for match in self._FENCED_BLOCK_RE.finditer(llm_output)
        ]

        if blocks:
            python_blocks = [body for tag, body in blocks if tag in self._PYTHON_FENCE_TAGS]
            untagged_blocks = [body for tag, body in blocks if not tag]
            if python_blocks:
                code = python_blocks[0]
            elif untagged_blocks:
                code = untagged_blocks[0]
            else:
                code = blocks[0][1]
        else:
            # No complete fence: fall back to the heuristic clean-up.
            code = self._FENCE_LINE_RE.sub('', llm_output)
            for pattern in self._TRAILING_PROSE_PATTERNS:
                code = pattern.sub('', code)

        code = code.strip()

        logger.info(f"Extracted {len(code)} characters of clean Python code")
        logger.debug(f"Original LLM output:\n{llm_output[:500]}...")
        logger.debug(f"Extracted code:\n{code[:500]}...")

        return code

    def validate_python_syntax(self, code: str) -> Tuple[bool, Optional[str]]:
        """Check that ``code`` parses as Python without executing it.

        Returns:
            ``(True, None)`` when ``ast.parse`` succeeds, otherwise
            ``(False, message)`` where ``message`` describes the failure.
        """

        try:
            ast.parse(code)
            logger.info("✅ Code syntax validation passed")
            return True, None
        except SyntaxError as e:
            error_msg = f"Syntax error on line {e.lineno}: {e.msg}"
            if e.text:
                error_msg += f"\n  Code: {e.text.strip()}"
                # offset may be None; only draw the caret when it is known.
                if e.offset is not None:
                    error_msg += f"\n  Position: {' ' * (e.offset - 1)}^"
            logger.error(f"❌ Syntax validation failed: {error_msg}")
            return False, error_msg
        except Exception as e:
            error_msg = f"Code parsing error: {str(e)}"
            logger.error(f"❌ Code parsing failed: {error_msg}")
            return False, error_msg

    def analyze_code_quality(self, code: str) -> Dict[str, Any]:
        """Score ``code`` with simple substring heuristics.

        The checks are plain substring tests, not a parse of the code, so they
        can over-match (for example ``'key'`` also matches ``keyword``).

        Returns:
            A dict with the boolean flags ``has_error_handling``,
            ``has_authentication``, ``has_proper_imports`` and
            ``has_main_execution``, the integer ``line_count``, and
            ``complexity_score`` (0-100): 20 for imports, 25 for
            authentication, 25 for error handling, 20 for main execution and
            10 for having at least 10 lines.
        """

        code_lower = code.lower()

        quality_metrics = {
            'has_error_handling': any(
                keyword in code_lower for keyword in ('try:', 'except', 'raise', 'error')
            ),
            'has_authentication': any(
                keyword in code_lower for keyword in ('api_key', 'token', 'auth', 'key')
            ),
            'has_proper_imports': any(keyword in code for keyword in ('import ', 'from ')),
            'complexity_score': 0,
            'line_count': len(code.split('\n')),
            'has_main_execution': any(
                keyword in code for keyword in ('if __name__', 'print(', 'def ', 'class ')
            ),
        }

        score = 0
        if quality_metrics['has_proper_imports']:
            score += 20
        if quality_metrics['has_authentication']:
            score += 25
        if quality_metrics['has_error_handling']:
            score += 25
        if quality_metrics['has_main_execution']:
            score += 20
        if quality_metrics['line_count'] >= 10:
            score += 10

        quality_metrics['complexity_score'] = score

        logger.info(f"Code quality analysis: {score}/100 points")

        return quality_metrics

    @staticmethod
    def _elapsed_ms(start_time: float) -> int:
        """Return whole milliseconds elapsed since ``start_time``."""
        return int((time.time() - start_time) * 1000)

    def _write_temp_script(self, api_name: str, code: str) -> str:
        """Write ``code`` to a uniquely named UTF-8 file in ``self.temp_dir``.

        ``api_name`` only contributes a sanitised prefix, so names containing
        path separators or spaces cannot escape the directory, and concurrent
        runs for the same API never share a file.
        """
        safe_name = re.sub(r'[^A-Za-z0-9_.-]+', '_', api_name).strip('._') or 'api'
        fd, path = tempfile.mkstemp(prefix=f"{safe_name}_", suffix="_test.py", dir=self.temp_dir)
        try:
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(code)
        except Exception:
            # Do not leave a half-written script behind.
            try:
                os.remove(path)
            except OSError:
                pass
            raise
        return path

    async def execute_code(self, api_name: str, llm_output: str, env_key: str) -> Tuple[str, int, Optional[int], Optional[str], Dict]:
        """Extract code from ``llm_output``, run it, and classify the outcome.

        Args:
            api_name: Label for the API under test; used in logs and as a
                prefix of the temporary script name.
            llm_output: Raw model response that should contain Python code.
            env_key: Name of the environment variable holding the API key.
                The script is not run when it is unset or empty.

        Returns:
            ``(status, elapsed_ms, response_code, error_message, quality_metrics)``.
            ``status`` is one of ``extraction_failed``, ``syntax_error``,
            ``missing_api_key``, ``full_success``, ``execution_success``,
            ``auth_error``, ``api_error``, ``network_error``,
            ``runtime_error``, ``timeout`` or ``execution_failed``.
            ``quality_metrics`` is empty when extraction or syntax validation
            fails.
        """

        start_time = time.time()

        logger.info(f"Executing code for {api_name}")

        # Step 1: Extract clean Python code
        try:
            clean_code = self.extract_python_code(llm_output)
        except Exception as e:
            return "extraction_failed", 0, None, f"Code extraction failed: {str(e)}", {}

        if not clean_code:
            return "extraction_failed", 0, None, "No Python code found in LLM output", {}

        # Step 2: Validate syntax
        syntax_valid, syntax_error = self.validate_python_syntax(clean_code)
        if not syntax_valid:
            return "syntax_error", self._elapsed_ms(start_time), None, syntax_error, {}

        # Step 3: Analyze code quality
        quality_metrics = self.analyze_code_quality(clean_code)

        # Step 4: Require the API key before touching the filesystem. An empty
        # value is treated the same as a missing one.
        env = os.environ.copy()
        if not env.get(env_key):
            logger.warning(f"Environment variable {env_key} not found")
            return (
                "missing_api_key",
                self._elapsed_ms(start_time),
                None,
                f"Missing environment variable: {env_key}",
                quality_metrics,
            )

        # Step 5: Execute the code
        temp_file = None
        try:
            temp_file = self._write_temp_script(api_name, clean_code)
            logger.info(f"Created temporary file: {temp_file}")

            logger.info(f"Executing Python code for {api_name}")

            # Run with the interpreter that is running this process; a bare
            # 'python' may be absent from PATH or be a different interpreter.
            command = [sys.executable or 'python', temp_file]
            timeout = self.EXECUTION_TIMEOUT_SECONDS

            # subprocess.run blocks, so hand it to the default executor to
            # keep the event loop responsive while the script runs.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None,
                lambda: subprocess.run(
                    command,
                    capture_output=True,
                    text=True,
                    errors='replace',
                    timeout=timeout,
                    env=env,
                ),
            )

            execution_time = self._elapsed_ms(start_time)

            if result.returncode == 0:
                logger.info(f"✅ Code executed successfully for {api_name}")

                # Heuristic: look for signs of a successful API call in stdout.
                output = result.stdout.lower()
                if any(indicator in output for indicator in self._SUCCESS_INDICATORS):
                    return "full_success", execution_time, 200, None, quality_metrics
                return "execution_success", execution_time, None, "Code ran but no clear API success", quality_metrics

            logger.error(f"❌ Code execution failed for {api_name}")
            logger.error(f"Return code: {result.returncode}")
            logger.error(f"STDOUT: {result.stdout}")
            logger.error(f"STDERR: {result.stderr}")

            # Classify the failure from stderr; first matching class wins.
            stderr = result.stderr.lower()
            for status, response_code, indicators in self._ERROR_CLASSES:
                if any(indicator in stderr for indicator in indicators):
                    return status, execution_time, response_code, result.stderr, quality_metrics
            return "runtime_error", execution_time, None, result.stderr, quality_metrics

        except subprocess.TimeoutExpired:
            return (
                "timeout",
                self._elapsed_ms(start_time),
                None,
                f"Code execution timed out after {self.EXECUTION_TIMEOUT_SECONDS} seconds",
                quality_metrics,
            )

        except Exception as e:
            return "execution_failed", self._elapsed_ms(start_time), None, f"Execution error: {str(e)}", quality_metrics

        finally:
            # Clean up temporary file
            if temp_file is not None:
                try:
                    os.remove(temp_file)
                except FileNotFoundError:
                    pass
                except OSError as e:
                    logger.warning(f"Failed to clean up temp file: {e}")

# Test function
async def test_fixed_execution_engine():
    """Run the engine end to end on a sample OpenWeatherMap response.

    This is a manual smoke test: it prints the result of each stage
    (extraction, syntax validation, quality analysis, execution) rather than
    asserting anything. The execution stage reports ``missing_api_key`` unless
    ``OPENWEATHER_API_KEY`` is set in the environment.
    """

    # Sample LLM output with markdown formatting (the problematic case):
    # a fenced code block followed by a sentence of explanation.
    sample_llm_output = '''```python
import os
import requests
from requests.exceptions import HTTPError

# Get the API key from environment variable
api_key = os.getenv('OPENWEATHER_API_KEY')

# Base URL for the OpenWeatherMap API
base_url = "https://api.openweathermap.org/data/3.0/onecall"

def get_weather_data(lat, lon):
    params = {
        'lat': lat,
        'lon': lon,
        'appid': api_key
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        print("Success!")
        return response.json()
    except Exception as e:
        print(f"Error: {e}")
        return None

# Test the function
print("Testing OpenWeatherMap API integration")
result = get_weather_data(33.44, -94.04)
if result:
    print(f"Weather data retrieved successfully")
```
This Python code integrates with the OpenWeatherMap API to get weather data.'''

    engine = FixedCodeExecutionEngine()

    print("Testing Fixed Code Execution Engine")
    print("=" * 60)

    # Test code extraction
    clean_code = engine.extract_python_code(sample_llm_output)
    # splitlines() yields the first *line*; split() would only give the
    # first whitespace-separated word.
    code_lines = clean_code.splitlines()
    print("\n1. Code Extraction:")
    print(f"   Original length: {len(sample_llm_output)}")
    print(f"   Extracted length: {len(clean_code)}")
    print(f"   First line: {code_lines[0] if code_lines else 'Empty'}")

    # Test syntax validation
    syntax_valid, syntax_error = engine.validate_python_syntax(clean_code)
    print("\n2. Syntax Validation:")
    print(f"   Valid: {syntax_valid}")
    if syntax_error:
        print(f"   Error: {syntax_error}")

    # Test quality analysis
    quality = engine.analyze_code_quality(clean_code)
    print("\n3. Quality Analysis:")
    for key, value in quality.items():
        print(f"   {key}: {value}")

    # Test execution (if syntax is valid)
    if syntax_valid:
        print("\n4. Code Execution Test:")
        status, time_ms, response_code, error_msg, metrics = await engine.execute_code(
            "OpenWeatherMap", sample_llm_output, "OPENWEATHER_API_KEY"
        )
        print(f"   Status: {status}")
        print(f"   Time: {time_ms}ms")
        print(f"   Response Code: {response_code}")
        print(f"   Error: {error_msg}")

    print("\n" + "=" * 60)

# Script entry point: run the manual smoke test on a fresh event loop when
# this file is executed directly (not when it is imported).
if __name__ == "__main__":
    smoke_test = test_fixed_execution_engine()
    asyncio.run(smoke_test)