bench_python_c/test_validation.py at main · nasirus/bench_python_c · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
"""
Validation script to ensure Python and Cython implementations produce identical results.
"""
import pandas as pd
import numpy as np
import python_impl
import cython_impl


def test_dataframe_equality():
    """Check that the Python and Cython generators build matching DataFrames.

    Builds a 1000-row frame with each implementation and compares, in order:
    shape, column names, per-column dtype (``category`` excluded), the
    non-``category`` values (``value1``/``value2`` via ``np.allclose``, the
    remaining columns element-wise) and finally the ``category`` values.

    Raises:
        AssertionError: On the first comparison that does not hold.
    """
    print("Testing DataFrame generation...")

    # A small frame keeps the comparison quick.
    row_count = 1000

    reference = python_impl.generate_dataframe(row_count)
    candidate = cython_impl.generate_dataframe_cython(row_count)

    # Shape
    assert reference.shape == candidate.shape, f"Shape mismatch: {reference.shape} vs {candidate.shape}"
    print(f"  ✓ Shapes match: {reference.shape}")

    # Column names
    assert list(reference.columns) == list(candidate.columns), "Column names don't match"
    print(f"  ✓ Column names match: {list(reference.columns)}")

    # dtypes -- the string column is skipped because its representation may differ
    for name in reference.columns:
        if name == 'category':
            continue
        assert reference[name].dtype == candidate[name].dtype, f"dtype mismatch for {name}"
    print("  ✓ Data types match")

    # Values -- tolerance-based for value1/value2, exact for everything else
    tolerant_columns = ('value1', 'value2')
    for name in reference.columns:
        if name in tolerant_columns:
            assert np.allclose(reference[name].values, candidate[name].values), f"Values don't match for {name}"
            continue
        if name == 'category':
            continue
        assert (reference[name].values == candidate[name].values).all(), f"Values don't match for {name}"
    print("  ✓ All numeric values match")

    # String column
    assert (reference['category'].values == candidate['category'].values).all(), "Category values don't match"
    print("  ✓ String values match")

    print("\n✅ All tests passed! Both implementations produce identical results.")


def test_parquet_files():
    """Test that parquet files can be read back correctly.

    Generates a 1000-row DataFrame with each implementation, writes each one
    to a parquet file in the current working directory, reads both files back
    with pandas and compares the ``id``, ``value1``, ``value2`` and ``flag``
    columns (``value1``/``value2`` via ``np.allclose``, ``id``/``flag``
    element-wise).

    The two files are removed in a ``finally`` block so they do not linger on
    disk when a write, a read or an assertion fails.

    Raises:
        AssertionError: If a compared column differs between the two files.
    """
    import contextlib
    import os

    print("\nTesting parquet file I/O...")

    num_rows = 1000
    python_path = 'test_python.parquet'
    cython_path = 'test_cython.parquet'

    try:
        # Generate and write using Python implementation
        py_df = python_impl.generate_dataframe(num_rows)
        python_impl.write_parquet(py_df, python_path)

        # Generate and write using Cython implementation
        cy_df = cython_impl.generate_dataframe_cython(num_rows)
        cython_impl.write_parquet_cython(cy_df, cython_path)

        # Read back the files
        py_df_read = pd.read_parquet(python_path)
        cy_df_read = pd.read_parquet(cython_path)

        print(f"  ✓ Python parquet file: {py_df_read.shape}")
        print(f"  ✓ Cython parquet file: {cy_df_read.shape}")

        # Compare values from read files
        for col in ['id', 'value1', 'value2', 'flag']:
            if col in ['value1', 'value2']:
                assert np.allclose(py_df_read[col].values, cy_df_read[col].values), f"Read values don't match for {col}"
            else:
                assert (py_df_read[col].values == cy_df_read[col].values).all(), f"Read values don't match for {col}"

        print("  ✓ Parquet files contain identical data")
    finally:
        # Cleanup runs on failure too. A file may not exist if the error
        # happened before it was written, so a missing file is not an error
        # and must not mask the original exception.
        for path in (python_path, cython_path):
            with contextlib.suppress(FileNotFoundError):
                os.remove(path)
        print("  ✓ Test files cleaned up")

    print("\n✅ Parquet I/O tests passed!")


def main():
    """Run every validation test and report the outcome.

    Returns:
        int: 0 when all tests pass, 1 when a test assertion fails or any
        other exception is raised while running them.
    """
    rule = "=" * 80
    print(rule)
    print("Validation Tests for Python vs Cython Implementations")
    print(rule)
    print()

    status = 1
    try:
        # Run the checks in order; the first failure aborts the run.
        for check in (test_dataframe_equality, test_parquet_files):
            check()

        print()
        print(rule)
        print("ALL VALIDATION TESTS PASSED!")
        print(rule)
        status = 0
    except AssertionError as failure:
        print(f"\n❌ Test failed: {failure}")
    except Exception as error:
        print(f"\n❌ Error: {error}")

    return status


if __name__ == '__main__':
    # The bare ``exit`` helper is injected by the ``site`` module and is
    # absent under ``python -S`` or in frozen builds; raising SystemExit
    # always works and propagates main()'s return code as the exit status.
    raise SystemExit(main())