-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathverify_tutorials.py
More file actions
118 lines (95 loc) · 3.59 KB
/
verify_tutorials.py
File metadata and controls
118 lines (95 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
"""
Verification Script for Spark Tutorials
======================================
This script verifies that all tutorial files are present and have valid Python syntax.
It doesn't require Spark to be installed.
To run: python verify_tutorials.py
"""
import ast
import os
import sys
import tokenize
def verify_file(filename):
    """Verify that a Python file exists and contains valid Python syntax.

    Prints a short report to stdout: existence and size, the syntax check
    result, line counts, and which Spark-related names occur in the text.

    Args:
        filename: Path of the Python source file to check.

    Returns:
        True if the file exists, can be read, and parses with ``ast.parse``;
        False otherwise (the reason is printed).
    """
    print(f"\nVerifying {filename}...")

    # Check if file exists
    if not os.path.exists(filename):
        print(f" ✗ File not found: {filename}")
        return False

    try:
        # Check file size (inside the try so a file that vanishes after the
        # existence check is reported as a failure instead of crashing).
        file_size = os.path.getsize(filename)
        print(f" ✓ File exists ({file_size:,} bytes)")

        # Check Python syntax. tokenize.open decodes the file the way the
        # interpreter would (UTF-8 by default, honouring a BOM or a PEP 263
        # coding cookie) rather than using the platform's locale encoding.
        with tokenize.open(filename) as f:
            content = f.read()
        ast.parse(content, filename=filename)
        print(" ✓ Valid Python syntax")

        # Count lines. A trailing newline terminates the last line; it does
        # not start a new one, so drop the empty tail that split() produces.
        lines = content.split('\n')
        if lines[-1] == '':
            lines.pop()
        code_lines = [
            line for line in lines
            if line.strip() and not line.strip().startswith('#')
        ]
        print(f" ✓ {len(lines)} total lines, {len(code_lines)} code lines")

        # Check for key Spark names. This is a plain substring search over
        # the file text, not an inspection of actual import statements.
        spark_imports = ['pyspark', 'SparkSession', 'SparkContext']
        found_imports = [imp for imp in spark_imports if imp in content]
        if found_imports:
            print(f" ✓ Found Spark imports: {', '.join(found_imports)}")

        return True
    except SyntaxError as e:
        # Raised by ast.parse for invalid code and by tokenize.open for an
        # invalid encoding declaration.
        print(f" ✗ Syntax error: {e}")
        return False
    except Exception as e:
        # Deliberately broad: any other failure (I/O error, undecodable
        # bytes, ...) is reported as a failed check rather than aborting
        # the whole verification run.
        print(f" ✗ Error reading file: {e}")
        return False
def main():
    """Verify every expected tutorial file and print a summary report.

    Python files are checked with ``verify_file``; any other file is only
    checked for existence. File names are resolved relative to the current
    working directory.

    Returns:
        0 if every listed file verified successfully, 1 otherwise, so the
        value can be used directly as the process exit status.
    """
    print("=" * 60)
    print("SPARK TUTORIALS VERIFICATION")
    print("=" * 60)

    # List of files to verify, as (display name, filename) pairs
    files = [
        ("Tutorial 1: RDD Basics", "tutorial_1_spark_rdd_basics.py"),
        ("Tutorial 2: SQL and DataFrames", "tutorial_2_spark_sql_dataframes.py"),
        ("Tutorial 3: MLlib", "tutorial_3_spark_mllib.py"),
        ("Tutorial 4: ML Pipeline", "tutorial_4_spark_ml_pipeline.py"),
        ("Test Script", "test_all_tutorials.py"),
        ("README", "README.md"),
    ]

    results = []

    # Verify each file
    for name, filename in files:
        print(f"\n{name}:")
        if filename.endswith('.py'):
            success = verify_file(filename)
        else:
            # Just check if non-Python files exist
            success = os.path.exists(filename)
            if success:
                size = os.path.getsize(filename)
                print(f" ✓ File exists ({size:,} bytes)")
            else:
                print(" ✗ File not found")
        results.append((name, success))

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    passed = sum(1 for _, success in results if success)
    total = len(results)

    for name, success in results:
        status = "✓" if success else "✗"
        print(f"{status} {name}")

    print(f"\nTotal: {passed}/{total} files verified successfully")

    if passed == total:
        print("\n✅ All tutorial files are present and valid!")
        print("\nTo run the tutorials, you'll need to:")
        print("1. Install PySpark: pip install pyspark numpy pandas")
        print("2. Run individual tutorials: python tutorial_1_spark_rdd_basics.py")
        print("3. Or use spark-submit: spark-submit tutorial_1_spark_rdd_basics.py")
        return 0

    print(f"\n⚠️ {total - passed} file(s) have issues")
    # Non-zero status lets shells and CI jobs detect a failed verification.
    return 1


if __name__ == "__main__":
    # Propagate the verification result as the process exit status.
    sys.exit(main())