-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpython_impl.py
More file actions
61 lines (47 loc) · 1.78 KB
/
python_impl.py
File metadata and controls
61 lines (47 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""Pure Python implementation for DataFrame generation and parquet writing."""
import pandas as pd
import numpy as np
from typing import Tuple
def generate_dataframe(num_rows: int = 1_000_000) -> pd.DataFrame:
    """
    Generate a pandas DataFrame filled with deterministic synthetic data.

    Every column is a pure function of the row index ``i`` (nothing is
    random), so repeated calls with the same ``num_rows`` build equal frames:

    * ``id``: ``i``
    * ``value1``: ``i * 2.5``
    * ``value2``: ``i ** 0.5``
    * ``category``: ``'cat_<i % 10>'``
    * ``flag``: ``True`` for odd ``i``, ``False`` for even ``i``

    Args:
        num_rows: Number of rows to generate (default: 1 million). Zero or a
            negative value yields an empty frame.

    Returns:
        pandas DataFrame with the five columns listed above. An empty result
        keeps the same column dtypes as a populated one.
    """
    # pandas infers ``object`` for a column built from an empty list, which
    # would give an empty frame a different schema from a populated one.
    # Always build at least one row so dtype inference sees real values, and
    # trim that row off again below when no rows were requested.
    indices = range(max(num_rows, 1))
    data = {
        'id': list(indices),
        'value1': [i * 2.5 for i in indices],
        'value2': [i ** 0.5 for i in indices],
        'category': [f'cat_{i % 10}' for i in indices],
        'flag': [bool(i % 2) for i in indices],
    }
    frame = pd.DataFrame(data)
    if num_rows < 1:
        # Zero-row slice keeps the inferred dtypes; copy to detach it from
        # the one-row frame it was sliced from.
        frame = frame.iloc[:0].copy()
    return frame
def write_parquet(df: pd.DataFrame, filename: str) -> None:
    """
    Persist a DataFrame as a snappy-compressed parquet file.

    The file is written through the pyarrow engine and the DataFrame index
    is not included in the output.

    Args:
        df: DataFrame to write
        filename: Output parquet file path
    """
    # Writer settings gathered in one place so they are easy to see at a glance.
    writer_options = {
        'engine': 'pyarrow',
        'compression': 'snappy',
        'index': False,
    }
    df.to_parquet(filename, **writer_options)
def run_benchmark(num_rows: int = 1_000_000, output_file: str = 'output_python.parquet') -> Tuple[float, float]:
    """
    Time the two benchmark phases: DataFrame generation, then parquet writing.

    Each phase is measured separately with ``time.perf_counter``.

    Args:
        num_rows: Number of rows to generate
        output_file: Output parquet file path

    Returns:
        Tuple of (generation_time, write_time) in seconds
    """
    import time

    clock = time.perf_counter

    # Phase 1: build the DataFrame.
    gen_started = clock()
    frame = generate_dataframe(num_rows)
    gen_elapsed = clock() - gen_started

    # Phase 2: write it to disk as parquet.
    write_started = clock()
    write_parquet(frame, output_file)
    write_elapsed = clock() - write_started

    return gen_elapsed, write_elapsed