-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_validation.py
More file actions
executable file
·117 lines (89 loc) · 3.86 KB
/
test_validation.py
File metadata and controls
executable file
·117 lines (89 loc) · 3.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
"""
Validation script to ensure Python and Cython implementations produce identical results.
"""
import pandas as pd
import numpy as np
import python_impl
import cython_impl
def test_dataframe_equality():
    """Verify that the Python and Cython generators return matching DataFrames.

    Builds a 1000-row frame with each implementation and compares, in order:
    the shapes, the column names, the dtype of every column except
    ``category``, and finally the values. ``value1`` and ``value2`` are
    compared with ``np.allclose`` (default tolerances); every other column,
    including ``category``, must match element for element.

    Raises:
        AssertionError: On the first check that fails.
    """
    print("Testing DataFrame generation...")

    # A small frame is enough to compare the two implementations.
    row_count = 1000
    expected = python_impl.generate_dataframe(row_count)
    actual = cython_impl.generate_dataframe_cython(row_count)

    # Shapes
    assert expected.shape == actual.shape, f"Shape mismatch: {expected.shape} vs {actual.shape}"
    print(f" ✓ Shapes match: {expected.shape}")

    # Column names
    assert list(expected.columns) == list(actual.columns), "Column names don't match"
    print(f" ✓ Column names match: {list(expected.columns)}")

    # The string column is handled separately: its dtype is not compared and
    # its values are checked last.
    non_string_columns = [name for name in expected.columns if name != 'category']

    # Dtypes
    for name in non_string_columns:
        assert expected[name].dtype == actual[name].dtype, f"dtype mismatch for {name}"
    print(" ✓ Data types match")

    # Values: tolerant comparison for the float columns, exact for the rest.
    float_columns = ('value1', 'value2')
    for name in non_string_columns:
        left = expected[name].values
        right = actual[name].values
        if name in float_columns:
            assert np.allclose(left, right), f"Values don't match for {name}"
        else:
            assert (left == right).all(), f"Values don't match for {name}"
    print(" ✓ All numeric values match")

    # String values
    categories_equal = expected['category'].values == actual['category'].values
    assert categories_equal.all(), "Category values don't match"
    print(" ✓ String values match")

    print("\n✅ All tests passed! Both implementations produce identical results.")
def test_parquet_files():
    """Check that both implementations write parquet files holding the same data.

    Generates 1000 rows with each implementation, writes them out under the
    relative file names ``test_python.parquet`` and ``test_cython.parquet``,
    reads both files back with ``pd.read_parquet`` and compares the ``id``,
    ``value1``, ``value2`` and ``flag`` columns (floats with ``np.allclose``,
    the others exactly).

    The two files are removed in a ``finally`` clause, so a failed comparison
    or an I/O error no longer leaves stale parquet files behind.

    NOTE(review): the ``category`` column is not compared after the round
    trip; confirm whether that omission is intentional.

    Raises:
        AssertionError: If a compared column differs between the two files.
    """
    import os

    print("\nTesting parquet file I/O...")
    num_rows = 1000
    python_path = 'test_python.parquet'
    cython_path = 'test_cython.parquet'

    try:
        # Generate and write using the Python implementation
        py_df = python_impl.generate_dataframe(num_rows)
        python_impl.write_parquet(py_df, python_path)

        # Generate and write using the Cython implementation
        cy_df = cython_impl.generate_dataframe_cython(num_rows)
        cython_impl.write_parquet_cython(cy_df, cython_path)

        # Read back the files
        py_df_read = pd.read_parquet(python_path)
        cy_df_read = pd.read_parquet(cython_path)
        print(f" ✓ Python parquet file: {py_df_read.shape}")
        print(f" ✓ Cython parquet file: {cy_df_read.shape}")

        # Compare values from the files that were read back
        for col in ['id', 'value1', 'value2', 'flag']:
            if col in ['value1', 'value2']:
                assert np.allclose(py_df_read[col].values, cy_df_read[col].values), f"Read values don't match for {col}"
            else:
                assert (py_df_read[col].values == cy_df_read[col].values).all(), f"Read values don't match for {col}"
        print(" ✓ Parquet files contain identical data")
    finally:
        # Cleanup runs on success and on failure. A file may be missing if an
        # earlier step raised before writing it, so only remove what exists;
        # otherwise FileNotFoundError would mask the original error.
        for path in (python_path, cython_path):
            if os.path.exists(path):
                os.remove(path)
        print(" ✓ Test files cleaned up")

    print("\n✅ Parquet I/O tests passed!")
def main():
    """Run every validation test and translate the outcome into an exit status.

    Prints a banner, runs ``test_dataframe_equality`` followed by
    ``test_parquet_files``, and prints a closing banner when both complete.

    Returns:
        0 when both tests finish without raising; 1 when either raises an
        ``AssertionError`` (reported as a test failure) or any other
        ``Exception`` (reported as an error).
    """
    rule = "=" * 80
    print(rule)
    print("Validation Tests for Python vs Cython Implementations")
    print(rule)
    print()

    # Assume failure until the whole run has completed.
    status = 1
    try:
        test_dataframe_equality()
        test_parquet_files()
        print()
        print(rule)
        print("ALL VALIDATION TESTS PASSED!")
        print(rule)
        status = 0
    except AssertionError as failure:
        print(f"\n❌ Test failed: {failure}")
    except Exception as error:
        # Top-level boundary: report anything unexpected instead of crashing.
        print(f"\n❌ Error: {error}")
    return status
if __name__ == '__main__':
    # Propagate main()'s status as the process exit code. SystemExit is used
    # rather than the bare exit() helper, which is injected by the `site`
    # module for interactive sessions and is missing under `python -S`.
    raise SystemExit(main())